datamarket 0.6.0__py3-none-any.whl → 0.10.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- datamarket/__init__.py +0 -1
- datamarket/exceptions/__init__.py +1 -0
- datamarket/exceptions/main.py +118 -0
- datamarket/interfaces/alchemy.py +1934 -25
- datamarket/interfaces/aws.py +81 -14
- datamarket/interfaces/azure.py +127 -0
- datamarket/interfaces/drive.py +60 -10
- datamarket/interfaces/ftp.py +37 -14
- datamarket/interfaces/llm.py +1220 -0
- datamarket/interfaces/nominatim.py +314 -42
- datamarket/interfaces/peerdb.py +272 -104
- datamarket/interfaces/proxy.py +354 -50
- datamarket/interfaces/tinybird.py +7 -15
- datamarket/params/nominatim.py +439 -0
- datamarket/utils/__init__.py +1 -1
- datamarket/utils/airflow.py +10 -7
- datamarket/utils/alchemy.py +2 -1
- datamarket/utils/logs.py +88 -0
- datamarket/utils/main.py +138 -10
- datamarket/utils/nominatim.py +201 -0
- datamarket/utils/playwright/__init__.py +0 -0
- datamarket/utils/playwright/async_api.py +274 -0
- datamarket/utils/playwright/sync_api.py +281 -0
- datamarket/utils/requests.py +655 -0
- datamarket/utils/selenium.py +6 -12
- datamarket/utils/strings/__init__.py +1 -0
- datamarket/utils/strings/normalization.py +217 -0
- datamarket/utils/strings/obfuscation.py +153 -0
- datamarket/utils/strings/standardization.py +40 -0
- datamarket/utils/typer.py +2 -1
- datamarket/utils/types.py +1 -0
- datamarket-0.10.3.dist-info/METADATA +172 -0
- datamarket-0.10.3.dist-info/RECORD +38 -0
- {datamarket-0.6.0.dist-info → datamarket-0.10.3.dist-info}/WHEEL +1 -2
- datamarket-0.6.0.dist-info/METADATA +0 -49
- datamarket-0.6.0.dist-info/RECORD +0 -24
- datamarket-0.6.0.dist-info/top_level.txt +0 -1
- {datamarket-0.6.0.dist-info → datamarket-0.10.3.dist-info/licenses}/LICENSE +0 -0
datamarket/utils/requests.py
ADDED
@@ -0,0 +1,655 @@
+########################################################################################################################
+# IMPORTS
+
+import logging
+from contextlib import suppress
+from datetime import timedelta
+from email.utils import parsedate_to_datetime
+from http.cookies import SimpleCookie
+from typing import Any, Dict, Mapping, Optional, Sequence, Tuple
+from urllib.parse import urlparse
+
+from bs4 import BeautifulSoup
+from requests.cookies import RequestsCookieJar, create_cookie
+from requests.exceptions import HTTPError
+from rnet import Emulation, Proxy
+from rnet.blocking import Client
+from rnet.blocking import Response as RnetResponse
+from rnet.exceptions import ConnectionError, TimeoutError, TlsError
+from rnet.header import OrigHeaderMap
+from tenacity import (
+    before_sleep_log,
+    retry,
+    retry_if_exception_type,
+    retry_if_not_exception_type,
+    stop_after_attempt,
+    stop_after_delay,
+    wait_exponential,
+)
+
+from datamarket.exceptions.main import IgnoredHTTPError
+
+from ..exceptions import BadRequestError, EmptyResponseError, NotFoundError, RedirectionDetectedError
+from ..interfaces.proxy import ProxyInterface
+from .main import ban_sleep
+
+########################################################################################################################
+# SETUP LOGGER
+
+logger = logging.getLogger(__name__)
+
+########################################################################################################################
+# CLASSES
+
+
+class RnetRequestAdapter:
+    """Adapter class for converting requests-style kwargs to rnet kwargs."""
+
+    @staticmethod
+    def _validate_supported_kwargs(requests_kwargs: Mapping[str, Any], supported: set) -> None:
+        """Validate that all kwargs are in the supported set."""
+        for key in requests_kwargs:
+            if key not in supported:
+                raise ValueError(
+                    f"The parameter '{key}' exists in requests but "
+                    f"is NOT supported by RNET. Remove it or add an explicit mapping."
+                )
+
+    @staticmethod
+    def _stringify_mapping(mapping: Mapping[Any, Any]) -> Dict[str, str]:
+        """Helper to ensure strict string conversion for keys and values."""
+        return {str(k): str(v) for k, v in mapping.items()}
+
+    @staticmethod
+    def _normalize_headers(value: Any) -> Dict[str, str]:
+        """Convert headers to a clean string dictionary."""
+        if isinstance(value, Mapping):
+            return RnetRequestAdapter._stringify_mapping(value)
+        try:
+            return RnetRequestAdapter._stringify_mapping(dict(value))
+        except (TypeError, ValueError) as e:
+            raise TypeError(f"Unsupported type for 'headers': {type(value)!r}") from e
+
+    @staticmethod
+    def _build_orig_header_map(clean_headers: Dict[str, str]) -> Optional[OrigHeaderMap]:
+        """Build OrigHeaderMap from clean headers to preserve order and casing."""
+        if not hasattr(OrigHeaderMap, "insert"):
+            return None
+
+        header_map = OrigHeaderMap()
+        for k in clean_headers:
+            header_map.insert(k)
+        return header_map
+
+    @staticmethod
+    def _map_headers(value: Any) -> Dict[str, Any]:
+        """Map headers parameter to rnet kwargs (headers and orig_headers)."""
+        rnet_kwargs = {}
+        clean_headers = RnetRequestAdapter._normalize_headers(value)
+        rnet_kwargs["headers"] = clean_headers
+
+        header_map = RnetRequestAdapter._build_orig_header_map(clean_headers)
+        if header_map is not None:
+            rnet_kwargs["orig_headers"] = header_map
+
+        return rnet_kwargs
+
+    @staticmethod
+    def _normalize_timeout(value: Any) -> Any:
+        """
+        Normalize timeout value to int or None.
+
+        WARNING: rnet does not support separate connect and read timeouts.
+        If a tuple is provided (connect, read), only the connect timeout is used
+        as the TOTAL timeout for the request.
+        """
+        if isinstance(value, (int, float)):
+            return int(value)
+
+        if isinstance(value, tuple) and len(value) == 2:
+            connect_timeout = value[0]
+            read_timeout = value[1]
+
+            # We use the connect_timeout as the total timeout to respect the stricter constraint,
+            # but this may cause the read phase to timeout prematurely.
+            if connect_timeout is not None and read_timeout is not None:
+                logger.warning(
+                    f"RNET LIMITATION: Separate connect/read timeouts are not supported (received {value}). "
+                    f"Using the connect timeout ({connect_timeout}s) as the TOTAL timeout. "
+                    f"The read timeout ({read_timeout}s) is IGNORED."
+                )
+
+            return int(connect_timeout) if connect_timeout is not None else None
+
+        return value
+
+    @staticmethod
+    def _map_direct_mappings(requests_kwargs: Mapping[str, Any]) -> Dict[str, Any]:
+        """Map direct mappings: headers, timeout, allow_redirects, and verify."""
+        rnet_kwargs = {}
+        direct_map = {
+            "headers": "headers",
+            "timeout": "timeout",
+            "allow_redirects": "allow_redirects",
+            "verify": "verify",
+        }
+
+        for src in ["headers", "timeout", "allow_redirects", "verify"]:
+            if src in requests_kwargs and requests_kwargs[src] is not None:
+                value = requests_kwargs[src]
+                dst = direct_map[src]
+
+                if src == "headers":
+                    rnet_kwargs.update(RnetRequestAdapter._map_headers(value))
+                elif src == "timeout":
+                    rnet_kwargs[dst] = RnetRequestAdapter._normalize_timeout(value)
+                else:
+                    rnet_kwargs[dst] = value
+
+        return rnet_kwargs
+
+    @staticmethod
+    def _map_query(requests_kwargs: Mapping[str, Any]) -> Dict[str, Any]:
+        """Map params to query."""
+        rnet_kwargs = {}
+        params = requests_kwargs.get("params")
+
+        if params is None:
+            return rnet_kwargs
+
+        if isinstance(params, Mapping):
+            rnet_kwargs["query"] = RnetRequestAdapter._stringify_mapping(params)
+            return rnet_kwargs
+
+        if not isinstance(params, (str, bytes, bytearray)):
+            with suppress(TypeError, ValueError):
+                rnet_kwargs["query"] = [(str(k), str(v)) for k, v in params]
+                return rnet_kwargs
+
+        raise TypeError(
+            "Unsupported format for 'params'. Expected a mapping or an iterable of "
+            "(key, value) pairs (e.g. [('a', 1), ('b', 2)]). "
+            f"Got type {type(params)!r}."
+        )
+
+    @staticmethod
+    def _map_body_and_files(requests_kwargs: Mapping[str, Any]) -> Dict[str, Any]:
+        """Map json, data, and files to appropriate rnet fields."""
+        rnet_kwargs = {}
+
+        json_data = requests_kwargs.get("json")
+        if json_data is not None:
+            if not isinstance(json_data, Mapping):
+                raise TypeError("Rnet 'json' expects a dict-like object.")
+            rnet_kwargs["json"] = dict(json_data)
+
+        data = requests_kwargs.get("data")
+        if data is not None:
+            if isinstance(data, Mapping):
+                rnet_kwargs["form"] = RnetRequestAdapter._stringify_mapping(data)
+
+            elif not isinstance(data, (str, bytes, bytearray)):
+                with suppress(TypeError, ValueError):
+                    rnet_kwargs["form"] = [(str(k), str(v)) for k, v in data]
+                    return rnet_kwargs
+
+                raise TypeError(
+                    "Unsupported format for 'data'. Expected a mapping or an iterable of "
+                    "(key, value) pairs (e.g. [('a', 1), ('b', 2)]). "
+                    f"Got type {type(data)!r}."
+                )
+
+            else:
+                rnet_kwargs["body"] = data
+
+        if requests_kwargs.get("files") is not None:
+            raise NotImplementedError("Mapping 'files' -> Rnet 'multipart' is not implemented yet.")
+
+        return rnet_kwargs
+
+    @staticmethod
+    def _map_auth(requests_kwargs: Mapping[str, Any]) -> Dict[str, Any]:
+        """Map auth to basic_auth or auth."""
+        rnet_kwargs = {}
+        auth = requests_kwargs.get("auth")
+        if auth:
+            if isinstance(auth, tuple) and len(auth) == 2:
+                user, pwd = auth
+                rnet_kwargs["basic_auth"] = (str(user), None if pwd is None else str(pwd))
+            else:
+                if not isinstance(auth, str):
+                    raise TypeError("Rnet 'auth' only supports string values (e.g. 'user:pass').")
+                rnet_kwargs["auth"] = auth
+        return rnet_kwargs
+
+    @staticmethod
+    def _map_proxy(requests_kwargs: Mapping[str, Any]) -> Dict[str, Any]:
+        """Map requests 'proxies' (dict or Proxy) to rnet 'proxy'."""
+        rnet_kwargs: Dict[str, Any] = {}
+
+        proxies = requests_kwargs.get("proxies")
+        if proxies is None:
+            return rnet_kwargs
+
+        if isinstance(proxies, Mapping):
+            url = proxies.get("https") or proxies.get("http")
+            if url is None:
+                raise ValueError("No suitable proxy URL found in 'proxies' dict")
+
+            rnet_kwargs["proxy"] = Proxy(url)
+        else:
+            rnet_kwargs["proxy"] = proxies
+
+        return rnet_kwargs
+
+    @staticmethod
+    def _map_cookies(requests_kwargs: Mapping[str, Any]) -> Dict[str, str]:
+        """Convert 'cookies' arg to a Cookie header string."""
+        cookies = requests_kwargs.get("cookies")
+        if not cookies:
+            return {}
+
+        cookie_list = []
+        if isinstance(cookies, Mapping):
+            for k, v in cookies.items():
+                cookie_list.append(f"{k}={v}")
+        else:
+            # Assume it's a CookieJar or iterable
+            try:
+                for c in cookies:
+                    # RequestsCookieJar yields cookies, specific cookie objects, or sometimes keys
+                    # depending on iteration. Safe access via name/value attributes.
+                    if hasattr(c, "name") and hasattr(c, "value"):
+                        cookie_list.append(f"{c.name}={c.value}")
+                    elif isinstance(c, tuple) and len(c) == 2:
+                        cookie_list.append(f"{c[0]}={c[1]}")
+            except TypeError:
+                pass
+
+        if not cookie_list:
+            return {}
+
+        return {"Cookie": "; ".join(cookie_list)}
+
+    @staticmethod
+    def requests_to_rnet_kwargs(**requests_kwargs: Any) -> Dict[str, Any]:
+        supported = {
+            "headers",
+            "timeout",
+            "allow_redirects",
+            "params",
+            "json",
+            "data",
+            "files",
+            "auth",
+            "proxies",
+            "cookies",
+            "verify",
+        }
+
+        RnetRequestAdapter._validate_supported_kwargs(requests_kwargs, supported)
+
+        rnet_kwargs: Dict[str, Any] = {}
+        rnet_kwargs.update(RnetRequestAdapter._map_direct_mappings(requests_kwargs))
+        rnet_kwargs.update(RnetRequestAdapter._map_query(requests_kwargs))
+        rnet_kwargs.update(RnetRequestAdapter._map_body_and_files(requests_kwargs))
+        rnet_kwargs.update(RnetRequestAdapter._map_auth(requests_kwargs))
+        rnet_kwargs.update(RnetRequestAdapter._map_proxy(requests_kwargs))
+
+        # Handle Cookies: Convert to header and merge into existing headers
+        cookie_header = RnetRequestAdapter._map_cookies(requests_kwargs)
+        if cookie_header:
+            if "headers" not in rnet_kwargs:
+                rnet_kwargs["headers"] = {}
+
+            # Merge logic: if Cookie exists, append; otherwise set.
+            existing_key = next((k for k in rnet_kwargs["headers"] if k.lower() == "cookie"), None)
+            if existing_key:
+                rnet_kwargs["headers"][existing_key] = (
+                    f"{rnet_kwargs['headers'][existing_key]}; {cookie_header['Cookie']}"
+                )
+            else:
+                rnet_kwargs["headers"]["Cookie"] = cookie_header["Cookie"]
+
+        return rnet_kwargs
+
+
+class RequestsCompatibleResponse:
+    """
+    A wrapper around rnet Response that provides backward compatibility with requests.Response API.
+    """
+
+    def __init__(self, rnet_response: RnetResponse):
+        self._rnet_response = rnet_response
+
+    @property
+    def text(self) -> str:
+        return self._rnet_response.text()
+
+    @property
+    def content(self) -> bytes:
+        return self._rnet_response.bytes()
+
+    @property
+    def status_code(self) -> int:
+        return self._rnet_response.status.as_int()
+
+    @property
+    def headers(self) -> Dict[str, str]:
+        headers = {}
+        for key, value in self._rnet_response.headers:
+            key_str = key.decode("utf-8") if isinstance(key, bytes) else str(key)
+            value_str = value.decode("utf-8") if isinstance(value, bytes) else str(value)
+            headers[key_str] = value_str
+        return headers

+    @property
+    def url(self) -> str:
+        return str(self._rnet_response.url)
+
+    @property
+    def ok(self) -> bool:
+        return self._rnet_response.status.is_success()
+
+    @property
+    def cookies(self) -> RequestsCookieJar:
+        jar = RequestsCookieJar()
+
+        raw = getattr(self._rnet_response, "cookies", None)
+        if raw is not None:
+            try:
+                items = raw.items() if hasattr(raw, "items") else raw
+                for k, v in items:
+                    jar.set(k, v)
+            except (TypeError, ValueError):
+                pass
+
+        host = urlparse(self.url).hostname
+        for k, v in self._rnet_response.headers:
+            key = k.decode("utf-8") if isinstance(k, bytes) else str(k)
+            if key.lower() == "set-cookie":
+                val = v.decode("utf-8") if isinstance(v, bytes) else str(v)
+                sc = SimpleCookie()
+                sc.load(val)
+                for name, morsel in sc.items():
+                    raw = morsel["expires"]
+                    try:
+                        expires = (
+                            int(raw)
+                            if raw and raw.isdigit()
+                            else (int(parsedate_to_datetime(raw).timestamp()) if raw else None)
+                        )
+                    except (ValueError, TypeError, AttributeError):
+                        expires = None
+
+                    ck = create_cookie(
+                        name=name,
+                        value=morsel.value,
+                        domain=morsel["domain"] or host,
+                        path=morsel["path"] or "/",
+                        secure=bool(morsel["secure"]),
+                        expires=expires,
+                        rest={"HttpOnly": morsel["httponly"]} if morsel["httponly"] else None,
+                    )
+                    jar.set_cookie(ck)
+
+        return jar
+
+    def raise_for_status(self) -> None:
+        if not self._rnet_response.status.is_success():
+            status_code = self._rnet_response.status.as_int()
+            url = str(self._rnet_response.url)
+            error = HTTPError(f"HTTP {status_code} error for {url}")
+            error.response = self
+            raise error
+
+    def bytes(self) -> bytes:
+        return self._rnet_response.bytes()
+
+    def json(self) -> Any:
+        return self._rnet_response.json()
+
+    @property
+    def status(self):
+        return self._rnet_response.status
+
+    def __getattr__(self, name):
+        return getattr(self._rnet_response, name)
+
+
+class RequestsClient:
+    """A robust, proxy-enabled HTTP client with retry logic and flexible output formats."""
+
+    # 1. FORBIDDEN HEADERS:
+    # We strip these entirely from user input. This forces rnet to generate them
+    # based on the selected Emulation (e.g., Firefox143).
+    MANAGED_HEADERS_TO_STRIP = {
+        "user-agent",
+        "connection",
+        "dnt",
+        "pragma",
+        "cache-control",
+        "upgrade-insecure-requests",
+        "priority",
+        "sec-ch-ua",
+        "sec-ch-ua-mobile",
+        "sec-ch-ua-platform",
+    }
+
+    # 2. OVERRIDE HEADERS:
+    # If the user provides these, we must use them EXACTLY as provided.
+    CRITICAL_OVERRIDE_HEADERS = {
+        "accept",
+        "accept-language",
+        "accept-encoding",
+        "sec-fetch-dest",
+        "sec-fetch-mode",
+        "sec-fetch-site",
+        "sec-fetch-user",
+    }
+
+    _REDIRECT_STATUS_CODES = set(range(300, 309))
+
+    def __init__(self, proxy_interface: Optional[ProxyInterface] = None):
+        self.proxy_interface = proxy_interface
+        # Default client for general use
+        self.client = Client(
+            emulation=Emulation.Firefox143,
+            cookie_store=True,
+            allow_redirects=True,
+            max_redirects=10,
+        )
+        # Cache for specialized clients
+        self._client_cache: Dict[Tuple[Tuple[str, str], ...], Client] = {}
+
+    def _get_cached_client(self, headers: Dict[str, str], allow_redirects: bool) -> Client:
+        """
+        Retrieves a cached Client instance or creates a new one if the specific
+        header/redirect configuration hasn't been seen before.
+        """
+        # Convert headers dict to a sorted tuple of items to make it hashable
+        # e.g., (('accept', 'application/json'), ('accept-language', 'en-US'))
+        headers_key = tuple(sorted(headers.items()))
+
+        # storage key includes headers and the allow_redirects flag
+        cache_key = (headers_key, allow_redirects)
+
+        if cache_key not in self._client_cache:
+            # specific logging to track when we actually incur the cost of creation
+            logger.debug(f"Initializing new rnet Client for specific headers: {headers.keys()}")
+
+            # Optional: Simple guard to prevent memory leaks if headers are randomized per request
+            if len(self._client_cache) > 50:
+                self._client_cache.clear()
+
+            self._client_cache[cache_key] = Client(
+                emulation=Emulation.Firefox143,
+                cookie_store=True,
+                allow_redirects=allow_redirects,
+                max_redirects=10,
+                headers=headers,
+            )
+
+        return self._client_cache[cache_key]
+
+    def _process_headers(self, headers: Dict[str, Any]) -> Tuple[Dict[str, str], Dict[str, str]]:
+        """
+        Splits headers into client_init_headers and request_headers.
+        """
+        if not headers:
+            return {}, {}
+
+        client_init_headers = {}
+        request_headers = {}
+
+        for key, value in headers.items():
+            key_lower = str(key).lower()
+
+            if key_lower in self.MANAGED_HEADERS_TO_STRIP:
+                continue
+
+            if key_lower in self.CRITICAL_OVERRIDE_HEADERS:
+                client_init_headers[key] = value
+                continue
+
+            request_headers[key] = value
+
+        return client_init_headers, request_headers
+
+    @retry(
+        retry=retry_if_exception_type((TlsError, TimeoutError, ConnectionError)),
+        wait=wait_exponential(exp_base=3, multiplier=3, max=60),
+        stop=stop_after_delay(timedelta(minutes=10)),
+        before_sleep=before_sleep_log(logger, logging.WARNING),
+        reraise=True,
+    )
+    def _request_with_proxy_retry(self, url: str, method: str, use_auth: bool, **params):
+        logger.info(f"Fetching data from {url} ...")
+
+        proxy_obj = None
+        if self.proxy_interface:
+            host, port, user, pwd = self.proxy_interface.get_proxies(raw=True, use_auth=use_auth)
+            if host and port:
+                proxy_url = f"http://{host}:{port}"
+                proxy_obj = Proxy.all(proxy_url, username=user, password=pwd) if user and pwd else Proxy.all(proxy_url)
+                logger.info(f"Using proxy: {host}:{port}")
+
+        request_params = params.copy()
+
+        if proxy_obj:
+            request_params["proxies"] = proxy_obj
+
+        client_init_headers = {}
+
+        if "headers" in request_params:
+            client_init_headers, request_method_headers = self._process_headers(request_params["headers"])
+            request_params["headers"] = request_method_headers
+
+        if client_init_headers:
+            active_client = self._get_cached_client(
+                headers=client_init_headers, allow_redirects=request_params.get("allow_redirects", True)
+            )
+        else:
+            active_client = self.client
+
+        # Convert args (including cookies) to rnet format
+        rnet_params = RnetRequestAdapter.requests_to_rnet_kwargs(**request_params)
+
+        rnet_response = getattr(active_client, method.lower())(url, **rnet_params)
+
+        return RequestsCompatibleResponse(rnet_response)
+
+    def _handle_http_error(self, status_code: int, url: str, response, allow_redirects: bool) -> None:
+        """
+        Handle HTTP errors with special handling for redirects when allow_redirects is False.
+
+        Args:
+            status_code: HTTP status code
+            url: Request URL
+            response: Response object
+            allow_redirects: Whether redirects are allowed
+
+        Raises:
+            RedirectionDetectedError: If a redirect status is received and allow_redirects is False
+            NotFoundError: For 404/410 errors
+            BadRequestError: For 400 errors
+            HTTPError: For other non-2xx status codes
+        """
+        # Check for redirect status codes when redirects are disabled
+
+        if not allow_redirects and status_code in self._REDIRECT_STATUS_CODES:
+            raise RedirectionDetectedError(
+                message=f"HTTP {status_code} redirect detected but allow_redirects is False for {url}",
+                response=response,
+            )
+
+        # Standard error handlers
+        error_handlers = {
+            404: lambda: NotFoundError(message=f"404 Not Found error for {url}", response=response),
+            410: lambda: NotFoundError(message=f"410 Gone error for {url}", response=response),
+            400: lambda: BadRequestError(message=f"400 Bad Request error for {url}", response=response),
+        }
+
+        if status_code in error_handlers:
+            raise error_handlers[status_code]()
+
+        # Raise for any other non-2xx status
+        response.raise_for_status()
+
+    @retry(
+        retry=retry_if_not_exception_type((NotFoundError, BadRequestError, RedirectionDetectedError, IgnoredHTTPError)),
+        wait=wait_exponential(exp_base=3, multiplier=3, max=60),
+        stop=stop_after_attempt(5),
+        before_sleep=before_sleep_log(logger, logging.WARNING),
+        reraise=True,
+    )
+    def get_data(
+        self,
+        url: str,
+        method: str = "GET",
+        output: str = "json",
+        sleep: tuple = (6, 3),
+        use_auth_proxies: bool = False,
+        max_proxy_delay: timedelta = timedelta(minutes=10),
+        ignored_status_codes: Sequence[int] = (),
+        **kwargs,
+    ):
+        params = kwargs.copy()
+
+        if "timeout" not in params and "read_timeout" not in params:
+            params["timeout"] = timedelta(seconds=30)
+        else:
+            params["timeout"] = timedelta(params["timeout"])
+
+        r = self._request_with_proxy_retry.retry_with(stop=stop_after_delay(max_proxy_delay))(
+            self, url, method, use_auth_proxies, **params
+        )
+
+        ban_sleep(*sleep)
+
+        status_code = r.status_code
+
+        if status_code in ignored_status_codes:
+            raise IgnoredHTTPError(message=f"Status {status_code} in ignored_status_codes for URL {url}", response=r)
+
+        # Check if allow_redirects is explicitly set in params, default to True
+        allow_redirects = params.get("allow_redirects", True)
+
+        # Handle HTTP errors with redirect detection
+        self._handle_http_error(status_code, url, r, allow_redirects)
+
+        response_content = r.content
+        if not response_content:
+            raise EmptyResponseError(message=f"Empty response received from {url} (status {status_code})", response=r)
+
+        output_format = {
+            "json": lambda: r.json(),
+            "text": lambda: r.text,
+            "soup": lambda: BeautifulSoup(response_content, "html.parser"),
+            "response": lambda: r,
+        }
+
+        if output in output_format:
+            return output_format[output]()
+
+        raise ValueError(f"Unsupported output format: {output}")
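For orientation, a minimal, hypothetical usage sketch of the new module (not part of the diff: RnetRequestAdapter, RequestsClient, and get_data come from the code above; the URL and argument values are made up):

# Hypothetical sketch; assumes the datamarket wheel is installed.
from datamarket.utils.requests import RequestsClient, RnetRequestAdapter

# The adapter translates requests-style kwargs into rnet kwargs:
# params -> query (stringified), cookies -> a merged "Cookie" header,
# and a (connect, read) timeout tuple collapses to the connect value.
rnet_kwargs = RnetRequestAdapter.requests_to_rnet_kwargs(
    headers={"Accept": "application/json"},
    params={"page": 1},
    cookies={"session": "abc123"},
    timeout=(5, 30),  # logs the RNET LIMITATION warning; only 5 survives
)
assert rnet_kwargs["query"] == {"page": "1"}
assert rnet_kwargs["timeout"] == 5
assert rnet_kwargs["headers"]["Cookie"] == "session=abc123"

# The client wraps the full flow: proxy selection, tenacity retries,
# ban_sleep pacing, error mapping, and output formatting
# ("json", "text", "soup", "response").
client = RequestsClient()  # optionally RequestsClient(proxy_interface=...)
soup = client.get_data("https://example.com", output="soup")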
datamarket/utils/selenium.py
CHANGED
@@ -17,9 +17,7 @@ logger = logging.getLogger(__name__)
 
 
 def get_chromedriver_version():
-    return int(
-        run_bash_command("/usr/bin/google-chrome --version").split(" ")[2].split(".")[0]
-    )
+    return int(run_bash_command("/usr/bin/google-chrome --version").split(" ")[2].split(".")[0])
 
 
 def get_driver(chrome_options=None, **kwargs):
@@ -38,23 +36,19 @@ def get_driver(chrome_options=None, **kwargs):
 
 def wait(driver, css_selector, timeout=30):
     logger.info(f"waiting for {css_selector}...")
-    return WebDriverWait(driver, timeout).until(
-        EC.visibility_of_element_located(("css selector", css_selector))
-    )
+    return WebDriverWait(driver, timeout).until(EC.visibility_of_element_located(("css selector", css_selector)))
 
 
 def wait_and_click(driver, css_selector, timeout=30):
     logger.info(f"clicking on {css_selector}...")
-    WebDriverWait(driver, timeout).until(
-        EC.element_to_be_clickable(("css selector", css_selector))
-    ).click()
+    WebDriverWait(driver, timeout).until(EC.element_to_be_clickable(("css selector", css_selector))).click()
 
 
 def wait_and_fill(driver, css_selector, text_to_fill, timeout=30):
     logger.info(f"sending text to {css_selector}...")
-    WebDriverWait(driver, timeout).until(
-        EC.presence_of_element_located(("css selector", css_selector))
-    ).send_keys(text_to_fill)
+    WebDriverWait(driver, timeout).until(EC.presence_of_element_located(("css selector", css_selector))).send_keys(
+        text_to_fill
+    )
 
 
 def scroll(driver, css_selector):
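And a matching hypothetical sketch for the reformatted Selenium helpers (the URL and CSS selectors below are invented; the helper names come from the diff above, and their behavior is unchanged by the reformat):

# Hypothetical sketch; assumes Chrome and chromedriver are available.
from datamarket.utils.selenium import get_driver, wait_and_fill, wait_and_click

driver = get_driver()
driver.get("https://example.com/login")
wait_and_fill(driver, "input[name='email']", "user@example.com")
wait_and_click(driver, "button[type='submit']")
driver.quit()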
datamarket/utils/strings/__init__.py
ADDED
@@ -0,0 +1 @@
+from .normalization import *  # noqa: F403