datamarket 0.6.0__py3-none-any.whl → 0.10.3__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.

This release of datamarket has been flagged as potentially problematic.

Files changed (38)
  1. datamarket/__init__.py +0 -1
  2. datamarket/exceptions/__init__.py +1 -0
  3. datamarket/exceptions/main.py +118 -0
  4. datamarket/interfaces/alchemy.py +1934 -25
  5. datamarket/interfaces/aws.py +81 -14
  6. datamarket/interfaces/azure.py +127 -0
  7. datamarket/interfaces/drive.py +60 -10
  8. datamarket/interfaces/ftp.py +37 -14
  9. datamarket/interfaces/llm.py +1220 -0
  10. datamarket/interfaces/nominatim.py +314 -42
  11. datamarket/interfaces/peerdb.py +272 -104
  12. datamarket/interfaces/proxy.py +354 -50
  13. datamarket/interfaces/tinybird.py +7 -15
  14. datamarket/params/nominatim.py +439 -0
  15. datamarket/utils/__init__.py +1 -1
  16. datamarket/utils/airflow.py +10 -7
  17. datamarket/utils/alchemy.py +2 -1
  18. datamarket/utils/logs.py +88 -0
  19. datamarket/utils/main.py +138 -10
  20. datamarket/utils/nominatim.py +201 -0
  21. datamarket/utils/playwright/__init__.py +0 -0
  22. datamarket/utils/playwright/async_api.py +274 -0
  23. datamarket/utils/playwright/sync_api.py +281 -0
  24. datamarket/utils/requests.py +655 -0
  25. datamarket/utils/selenium.py +6 -12
  26. datamarket/utils/strings/__init__.py +1 -0
  27. datamarket/utils/strings/normalization.py +217 -0
  28. datamarket/utils/strings/obfuscation.py +153 -0
  29. datamarket/utils/strings/standardization.py +40 -0
  30. datamarket/utils/typer.py +2 -1
  31. datamarket/utils/types.py +1 -0
  32. datamarket-0.10.3.dist-info/METADATA +172 -0
  33. datamarket-0.10.3.dist-info/RECORD +38 -0
  34. {datamarket-0.6.0.dist-info → datamarket-0.10.3.dist-info}/WHEEL +1 -2
  35. datamarket-0.6.0.dist-info/METADATA +0 -49
  36. datamarket-0.6.0.dist-info/RECORD +0 -24
  37. datamarket-0.6.0.dist-info/top_level.txt +0 -1
  38. {datamarket-0.6.0.dist-info → datamarket-0.10.3.dist-info/licenses}/LICENSE +0 -0
datamarket/utils/requests.py
@@ -0,0 +1,655 @@
+ ########################################################################################################################
+ # IMPORTS
+
+ import logging
+ from contextlib import suppress
+ from datetime import timedelta
+ from email.utils import parsedate_to_datetime
+ from http.cookies import SimpleCookie
+ from typing import Any, Dict, Mapping, Optional, Sequence, Tuple
+ from urllib.parse import urlparse
+
+ from bs4 import BeautifulSoup
+ from requests.cookies import RequestsCookieJar, create_cookie
+ from requests.exceptions import HTTPError
+ from rnet import Emulation, Proxy
+ from rnet.blocking import Client
+ from rnet.blocking import Response as RnetResponse
+ from rnet.exceptions import ConnectionError, TimeoutError, TlsError
+ from rnet.header import OrigHeaderMap
+ from tenacity import (
+     before_sleep_log,
+     retry,
+     retry_if_exception_type,
+     retry_if_not_exception_type,
+     stop_after_attempt,
+     stop_after_delay,
+     wait_exponential,
+ )
+
+ from datamarket.exceptions.main import IgnoredHTTPError
+
+ from ..exceptions import BadRequestError, EmptyResponseError, NotFoundError, RedirectionDetectedError
+ from ..interfaces.proxy import ProxyInterface
+ from .main import ban_sleep
+
+ ########################################################################################################################
+ # SETUP LOGGER
+
+ logger = logging.getLogger(__name__)
+
+ ########################################################################################################################
+ # CLASSES
+
+
+ class RnetRequestAdapter:
+     """Adapter class for converting requests-style kwargs to rnet kwargs."""
+
+     @staticmethod
+     def _validate_supported_kwargs(requests_kwargs: Mapping[str, Any], supported: set) -> None:
+         """Validate that all kwargs are in the supported set."""
+         for key in requests_kwargs:
+             if key not in supported:
+                 raise ValueError(
+                     f"The parameter '{key}' exists in requests but "
+                     f"is NOT supported by RNET. Remove it or add an explicit mapping."
+                 )
+
+     @staticmethod
+     def _stringify_mapping(mapping: Mapping[Any, Any]) -> Dict[str, str]:
+         """Helper to ensure strict string conversion for keys and values."""
+         return {str(k): str(v) for k, v in mapping.items()}
+
+     @staticmethod
+     def _normalize_headers(value: Any) -> Dict[str, str]:
+         """Convert headers to a clean string dictionary."""
+         if isinstance(value, Mapping):
+             return RnetRequestAdapter._stringify_mapping(value)
+         try:
+             return RnetRequestAdapter._stringify_mapping(dict(value))
+         except (TypeError, ValueError) as e:
+             raise TypeError(f"Unsupported type for 'headers': {type(value)!r}") from e
+
+     @staticmethod
+     def _build_orig_header_map(clean_headers: Dict[str, str]) -> Optional[OrigHeaderMap]:
+         """Build OrigHeaderMap from clean headers to preserve order and casing."""
+         if not hasattr(OrigHeaderMap, "insert"):
+             return None
+
+         header_map = OrigHeaderMap()
+         for k in clean_headers:
+             header_map.insert(k)
+         return header_map
+
+     @staticmethod
+     def _map_headers(value: Any) -> Dict[str, Any]:
+         """Map headers parameter to rnet kwargs (headers and orig_headers)."""
+         rnet_kwargs = {}
+         clean_headers = RnetRequestAdapter._normalize_headers(value)
+         rnet_kwargs["headers"] = clean_headers
+
+         header_map = RnetRequestAdapter._build_orig_header_map(clean_headers)
+         if header_map is not None:
+             rnet_kwargs["orig_headers"] = header_map
+
+         return rnet_kwargs
+
+     @staticmethod
+     def _normalize_timeout(value: Any) -> Any:
+         """
+         Normalize timeout value to int or None.
+
+         WARNING: rnet does not support separate connect and read timeouts.
+         If a tuple is provided (connect, read), only the connect timeout is used
+         as the TOTAL timeout for the request.
+         """
+         if isinstance(value, (int, float)):
+             return int(value)
+
+         if isinstance(value, tuple) and len(value) == 2:
+             connect_timeout = value[0]
+             read_timeout = value[1]
+
+             # We use the connect_timeout as the total timeout to respect the stricter constraint,
+             # but this may cause the read phase to time out prematurely.
+             if connect_timeout is not None and read_timeout is not None:
+                 logger.warning(
+                     f"RNET LIMITATION: Separate connect/read timeouts are not supported (received {value}). "
+                     f"Using the connect timeout ({connect_timeout}s) as the TOTAL timeout. "
+                     f"The read timeout ({read_timeout}s) is IGNORED."
+                 )
+
+             return int(connect_timeout) if connect_timeout is not None else None
+
+         return value
+
+     @staticmethod
+     def _map_direct_mappings(requests_kwargs: Mapping[str, Any]) -> Dict[str, Any]:
+         """Map direct mappings: headers, timeout, allow_redirects, and verify."""
+         rnet_kwargs = {}
+         direct_map = {
+             "headers": "headers",
+             "timeout": "timeout",
+             "allow_redirects": "allow_redirects",
+             "verify": "verify",
+         }
+
+         for src in ["headers", "timeout", "allow_redirects", "verify"]:
+             if src in requests_kwargs and requests_kwargs[src] is not None:
+                 value = requests_kwargs[src]
+                 dst = direct_map[src]
+
+                 if src == "headers":
+                     rnet_kwargs.update(RnetRequestAdapter._map_headers(value))
+                 elif src == "timeout":
+                     rnet_kwargs[dst] = RnetRequestAdapter._normalize_timeout(value)
+                 else:
+                     rnet_kwargs[dst] = value
+
+         return rnet_kwargs
+
+     @staticmethod
+     def _map_query(requests_kwargs: Mapping[str, Any]) -> Dict[str, Any]:
+         """Map params to query."""
+         rnet_kwargs = {}
+         params = requests_kwargs.get("params")
+
+         if params is None:
+             return rnet_kwargs
+
+         if isinstance(params, Mapping):
+             rnet_kwargs["query"] = RnetRequestAdapter._stringify_mapping(params)
+             return rnet_kwargs
+
+         if not isinstance(params, (str, bytes, bytearray)):
+             with suppress(TypeError, ValueError):
+                 rnet_kwargs["query"] = [(str(k), str(v)) for k, v in params]
+                 return rnet_kwargs
+
+         raise TypeError(
+             "Unsupported format for 'params'. Expected a mapping or an iterable of "
+             "(key, value) pairs (e.g. [('a', 1), ('b', 2)]). "
+             f"Got type {type(params)!r}."
+         )
+
+     @staticmethod
+     def _map_body_and_files(requests_kwargs: Mapping[str, Any]) -> Dict[str, Any]:
+         """Map json, data, and files to appropriate rnet fields."""
+         rnet_kwargs = {}
+
+         json_data = requests_kwargs.get("json")
+         if json_data is not None:
+             if not isinstance(json_data, Mapping):
+                 raise TypeError("Rnet 'json' expects a dict-like object.")
+             rnet_kwargs["json"] = dict(json_data)
+
+         data = requests_kwargs.get("data")
+         if data is not None:
+             if isinstance(data, Mapping):
+                 rnet_kwargs["form"] = RnetRequestAdapter._stringify_mapping(data)
+
+             elif not isinstance(data, (str, bytes, bytearray)):
+                 with suppress(TypeError, ValueError):
+                     rnet_kwargs["form"] = [(str(k), str(v)) for k, v in data]
+                     return rnet_kwargs
+
+                 raise TypeError(
+                     "Unsupported format for 'data'. Expected a mapping or an iterable of "
+                     "(key, value) pairs (e.g. [('a', 1), ('b', 2)]). "
+                     f"Got type {type(data)!r}."
+                 )
+
+             else:
+                 rnet_kwargs["body"] = data
+
+         if requests_kwargs.get("files") is not None:
+             raise NotImplementedError("Mapping 'files' -> Rnet 'multipart' is not implemented yet.")
+
+         return rnet_kwargs
+
+     @staticmethod
+     def _map_auth(requests_kwargs: Mapping[str, Any]) -> Dict[str, Any]:
+         """Map auth to basic_auth or auth."""
+         rnet_kwargs = {}
+         auth = requests_kwargs.get("auth")
+         if auth:
+             if isinstance(auth, tuple) and len(auth) == 2:
+                 user, pwd = auth
+                 rnet_kwargs["basic_auth"] = (str(user), None if pwd is None else str(pwd))
+             else:
+                 if not isinstance(auth, str):
+                     raise TypeError("Rnet 'auth' only supports string values (e.g. 'user:pass').")
+                 rnet_kwargs["auth"] = auth
+         return rnet_kwargs
+
+     @staticmethod
+     def _map_proxy(requests_kwargs: Mapping[str, Any]) -> Dict[str, Any]:
+         """Map requests 'proxies' (a dict or a Proxy) to rnet 'proxy'."""
+         rnet_kwargs: Dict[str, Any] = {}
+
+         proxies = requests_kwargs.get("proxies")
+         if proxies is None:
+             return rnet_kwargs
+
+         if isinstance(proxies, Mapping):
+             url = proxies.get("https") or proxies.get("http")
+             if url is None:
+                 raise ValueError("No suitable proxy URL found in 'proxies' dict")
+
+             rnet_kwargs["proxy"] = Proxy(url)
+         else:
+             rnet_kwargs["proxy"] = proxies
+
+         return rnet_kwargs
+
+     @staticmethod
+     def _map_cookies(requests_kwargs: Mapping[str, Any]) -> Dict[str, str]:
+         """Convert 'cookies' arg to a Cookie header string."""
+         cookies = requests_kwargs.get("cookies")
+         if not cookies:
+             return {}
+
+         cookie_list = []
+         if isinstance(cookies, Mapping):
+             for k, v in cookies.items():
+                 cookie_list.append(f"{k}={v}")
+         else:
+             # Assume it's a CookieJar or iterable
+             try:
+                 for c in cookies:
+                     # RequestsCookieJar yields cookies, specific cookie objects, or sometimes keys
+                     # depending on iteration. Safe access via name/value attributes.
+                     if hasattr(c, "name") and hasattr(c, "value"):
+                         cookie_list.append(f"{c.name}={c.value}")
+                     elif isinstance(c, tuple) and len(c) == 2:
+                         cookie_list.append(f"{c[0]}={c[1]}")
+             except TypeError:
+                 pass
+
+         if not cookie_list:
+             return {}
+
+         return {"Cookie": "; ".join(cookie_list)}
+
+     @staticmethod
+     def requests_to_rnet_kwargs(**requests_kwargs: Any) -> Dict[str, Any]:
+         supported = {
+             "headers",
+             "timeout",
+             "allow_redirects",
+             "params",
+             "json",
+             "data",
+             "files",
+             "auth",
+             "proxies",
+             "cookies",
+             "verify",
+         }
+
+         RnetRequestAdapter._validate_supported_kwargs(requests_kwargs, supported)
+
+         rnet_kwargs: Dict[str, Any] = {}
+         rnet_kwargs.update(RnetRequestAdapter._map_direct_mappings(requests_kwargs))
+         rnet_kwargs.update(RnetRequestAdapter._map_query(requests_kwargs))
+         rnet_kwargs.update(RnetRequestAdapter._map_body_and_files(requests_kwargs))
+         rnet_kwargs.update(RnetRequestAdapter._map_auth(requests_kwargs))
+         rnet_kwargs.update(RnetRequestAdapter._map_proxy(requests_kwargs))
+
+         # Handle Cookies: Convert to header and merge into existing headers
+         cookie_header = RnetRequestAdapter._map_cookies(requests_kwargs)
+         if cookie_header:
+             if "headers" not in rnet_kwargs:
+                 rnet_kwargs["headers"] = {}
+
+             # Merge logic: if Cookie exists, append; otherwise set.
+             existing_key = next((k for k in rnet_kwargs["headers"] if k.lower() == "cookie"), None)
+             if existing_key:
+                 rnet_kwargs["headers"][existing_key] = (
+                     f"{rnet_kwargs['headers'][existing_key]}; {cookie_header['Cookie']}"
+                 )
+             else:
+                 rnet_kwargs["headers"]["Cookie"] = cookie_header["Cookie"]
+
+         return rnet_kwargs
+
+
+ class RequestsCompatibleResponse:
+     """
+     A wrapper around rnet Response that provides backward compatibility with requests.Response API.
+     """
+
+     def __init__(self, rnet_response: RnetResponse):
+         self._rnet_response = rnet_response
+
+     @property
+     def text(self) -> str:
+         return self._rnet_response.text()
+
+     @property
+     def content(self) -> bytes:
+         return self._rnet_response.bytes()
+
+     @property
+     def status_code(self) -> int:
+         return self._rnet_response.status.as_int()
+
+     @property
+     def headers(self) -> Dict[str, str]:
+         headers = {}
+         for key, value in self._rnet_response.headers:
+             key_str = key.decode("utf-8") if isinstance(key, bytes) else str(key)
+             value_str = value.decode("utf-8") if isinstance(value, bytes) else str(value)
+             headers[key_str] = value_str
+         return headers
+
+     @property
+     def url(self) -> str:
+         return str(self._rnet_response.url)
+
+     @property
+     def ok(self) -> bool:
+         return self._rnet_response.status.is_success()
+
+     @property
+     def cookies(self) -> RequestsCookieJar:
+         jar = RequestsCookieJar()
+
+         raw = getattr(self._rnet_response, "cookies", None)
+         if raw is not None:
+             try:
+                 items = raw.items() if hasattr(raw, "items") else raw
+                 for k, v in items:
+                     jar.set(k, v)
+             except (TypeError, ValueError):
+                 pass
+
+         host = urlparse(self.url).hostname
+         for k, v in self._rnet_response.headers:
+             key = k.decode("utf-8") if isinstance(k, bytes) else str(k)
+             if key.lower() == "set-cookie":
+                 val = v.decode("utf-8") if isinstance(v, bytes) else str(v)
+                 sc = SimpleCookie()
+                 sc.load(val)
+                 for name, morsel in sc.items():
+                     raw = morsel["expires"]
+                     try:
+                         expires = (
+                             int(raw)
+                             if raw and raw.isdigit()
+                             else (int(parsedate_to_datetime(raw).timestamp()) if raw else None)
+                         )
+                     except (ValueError, TypeError, AttributeError):
+                         expires = None
+
+                     ck = create_cookie(
+                         name=name,
+                         value=morsel.value,
+                         domain=morsel["domain"] or host,
+                         path=morsel["path"] or "/",
+                         secure=bool(morsel["secure"]),
+                         expires=expires,
+                         rest={"HttpOnly": morsel["httponly"]} if morsel["httponly"] else None,
+                     )
+                     jar.set_cookie(ck)
+
+         return jar
+
+     def raise_for_status(self) -> None:
+         if not self._rnet_response.status.is_success():
+             status_code = self._rnet_response.status.as_int()
+             url = str(self._rnet_response.url)
+             error = HTTPError(f"HTTP {status_code} error for {url}")
+             error.response = self
+             raise error
+
+     def bytes(self) -> bytes:
+         return self._rnet_response.bytes()
+
+     def json(self) -> Any:
+         return self._rnet_response.json()
+
+     @property
+     def status(self):
+         return self._rnet_response.status
+
+     def __getattr__(self, name):
+         return getattr(self._rnet_response, name)
+
+
+ class RequestsClient:
+     """A robust, proxy-enabled HTTP client with retry logic and flexible output formats."""
+
+     # 1. FORBIDDEN HEADERS:
+     # We strip these entirely from user input. This forces rnet to generate them
+     # based on the selected Emulation (e.g., Firefox143).
+     MANAGED_HEADERS_TO_STRIP = {
+         "user-agent",
+         "connection",
+         "dnt",
+         "pragma",
+         "cache-control",
+         "upgrade-insecure-requests",
+         "priority",
+         "sec-ch-ua",
+         "sec-ch-ua-mobile",
+         "sec-ch-ua-platform",
+     }
+
+     # 2. OVERRIDE HEADERS:
+     # If the user provides these, we must use them EXACTLY as provided.
+     CRITICAL_OVERRIDE_HEADERS = {
+         "accept",
+         "accept-language",
+         "accept-encoding",
+         "sec-fetch-dest",
+         "sec-fetch-mode",
+         "sec-fetch-site",
+         "sec-fetch-user",
+     }
+
+     _REDIRECT_STATUS_CODES = set(range(300, 309))
+
+     def __init__(self, proxy_interface: Optional[ProxyInterface] = None):
+         self.proxy_interface = proxy_interface
+         # Default client for general use
+         self.client = Client(
+             emulation=Emulation.Firefox143,
+             cookie_store=True,
+             allow_redirects=True,
+             max_redirects=10,
+         )
+         # Cache for specialized clients
+         self._client_cache: Dict[Tuple[Tuple[str, str], ...], Client] = {}
+
+     def _get_cached_client(self, headers: Dict[str, str], allow_redirects: bool) -> Client:
+         """
+         Retrieves a cached Client instance or creates a new one if the specific
+         header/redirect configuration hasn't been seen before.
+         """
+         # Convert headers dict to a sorted tuple of items to make it hashable
+         # e.g., (('accept', 'application/json'), ('accept-language', 'en-US'))
+         headers_key = tuple(sorted(headers.items()))
+
+         # storage key includes headers and the allow_redirects flag
+         cache_key = (headers_key, allow_redirects)
+
+         if cache_key not in self._client_cache:
+             # specific logging to track when we actually incur the cost of creation
+             logger.debug(f"Initializing new rnet Client for specific headers: {headers.keys()}")
+
+             # Optional: Simple guard to prevent memory leaks if headers are randomized per request
+             if len(self._client_cache) > 50:
+                 self._client_cache.clear()
+
+             self._client_cache[cache_key] = Client(
+                 emulation=Emulation.Firefox143,
+                 cookie_store=True,
+                 allow_redirects=allow_redirects,
+                 max_redirects=10,
+                 headers=headers,
+             )
+
+         return self._client_cache[cache_key]
+
+     def _process_headers(self, headers: Dict[str, Any]) -> Tuple[Dict[str, str], Dict[str, str]]:
+         """
+         Splits headers into client_init_headers and request_headers.
+         """
+         if not headers:
+             return {}, {}
+
+         client_init_headers = {}
+         request_headers = {}
+
+         for key, value in headers.items():
+             key_lower = str(key).lower()
+
+             if key_lower in self.MANAGED_HEADERS_TO_STRIP:
+                 continue
+
+             if key_lower in self.CRITICAL_OVERRIDE_HEADERS:
+                 client_init_headers[key] = value
+                 continue
+
+             request_headers[key] = value
+
+         return client_init_headers, request_headers
+
+     @retry(
+         retry=retry_if_exception_type((TlsError, TimeoutError, ConnectionError)),
+         wait=wait_exponential(exp_base=3, multiplier=3, max=60),
+         stop=stop_after_delay(timedelta(minutes=10)),
+         before_sleep=before_sleep_log(logger, logging.WARNING),
+         reraise=True,
+     )
+     def _request_with_proxy_retry(self, url: str, method: str, use_auth: bool, **params):
+         logger.info(f"Fetching data from {url} ...")
+
+         proxy_obj = None
+         if self.proxy_interface:
+             host, port, user, pwd = self.proxy_interface.get_proxies(raw=True, use_auth=use_auth)
+             if host and port:
+                 proxy_url = f"http://{host}:{port}"
+                 proxy_obj = Proxy.all(proxy_url, username=user, password=pwd) if user and pwd else Proxy.all(proxy_url)
+                 logger.info(f"Using proxy: {host}:{port}")
+
+         request_params = params.copy()
+
+         if proxy_obj:
+             request_params["proxies"] = proxy_obj
+
+         client_init_headers = {}
+
+         if "headers" in request_params:
+             client_init_headers, request_method_headers = self._process_headers(request_params["headers"])
+             request_params["headers"] = request_method_headers
+
+         if client_init_headers:
+             active_client = self._get_cached_client(
+                 headers=client_init_headers, allow_redirects=request_params.get("allow_redirects", True)
+             )
+         else:
+             active_client = self.client
+
+         # Convert args (including cookies) to rnet format
+         rnet_params = RnetRequestAdapter.requests_to_rnet_kwargs(**request_params)
+
+         rnet_response = getattr(active_client, method.lower())(url, **rnet_params)
+
+         return RequestsCompatibleResponse(rnet_response)
+
+     def _handle_http_error(self, status_code: int, url: str, response, allow_redirects: bool) -> None:
+         """
+         Handle HTTP errors with special handling for redirects when allow_redirects is False.
+
+         Args:
+             status_code: HTTP status code
+             url: Request URL
+             response: Response object
+             allow_redirects: Whether redirects are allowed
+
+         Raises:
+             RedirectionDetectedError: If a redirect status is received and allow_redirects is False
+             NotFoundError: For 404/410 errors
+             BadRequestError: For 400 errors
+             HTTPError: For other non-2xx status codes
+         """
+         # Check for redirect status codes when redirects are disabled
+
+         if not allow_redirects and status_code in self._REDIRECT_STATUS_CODES:
+             raise RedirectionDetectedError(
+                 message=f"HTTP {status_code} redirect detected but allow_redirects is False for {url}",
+                 response=response,
+             )
+
+         # Standard error handlers
+         error_handlers = {
+             404: lambda: NotFoundError(message=f"404 Not Found error for {url}", response=response),
+             410: lambda: NotFoundError(message=f"410 Gone error for {url}", response=response),
+             400: lambda: BadRequestError(message=f"400 Bad Request error for {url}", response=response),
+         }
+
+         if status_code in error_handlers:
+             raise error_handlers[status_code]()
+
+         # Raise for any other non-2xx status
+         response.raise_for_status()
+
+     @retry(
+         retry=retry_if_not_exception_type((NotFoundError, BadRequestError, RedirectionDetectedError, IgnoredHTTPError)),
+         wait=wait_exponential(exp_base=3, multiplier=3, max=60),
+         stop=stop_after_attempt(5),
+         before_sleep=before_sleep_log(logger, logging.WARNING),
+         reraise=True,
+     )
+     def get_data(
+         self,
+         url: str,
+         method: str = "GET",
+         output: str = "json",
+         sleep: tuple = (6, 3),
+         use_auth_proxies: bool = False,
+         max_proxy_delay: timedelta = timedelta(minutes=10),
+         ignored_status_codes: Sequence[int] = (),
+         **kwargs,
+     ):
+         params = kwargs.copy()
+
+         if "timeout" not in params and "read_timeout" not in params:
+             params["timeout"] = timedelta(seconds=30)
+         else:
+             params["timeout"] = timedelta(params["timeout"])
+
+         r = self._request_with_proxy_retry.retry_with(stop=stop_after_delay(max_proxy_delay))(
+             self, url, method, use_auth_proxies, **params
+         )
+
+         ban_sleep(*sleep)
+
+         status_code = r.status_code
+
+         if status_code in ignored_status_codes:
+             raise IgnoredHTTPError(message=f"Status {status_code} in ignored_status_codes for URL {url}", response=r)
+
+         # Check if allow_redirects is explicitly set in params, default to True
+         allow_redirects = params.get("allow_redirects", True)
+
+         # Handle HTTP errors with redirect detection
+         self._handle_http_error(status_code, url, r, allow_redirects)
+
+         response_content = r.content
+         if not response_content:
+             raise EmptyResponseError(message=f"Empty response received from {url} (status {status_code})", response=r)
+
+         output_format = {
+             "json": lambda: r.json(),
+             "text": lambda: r.text,
+             "soup": lambda: BeautifulSoup(response_content, "html.parser"),
+             "response": lambda: r,
+         }
+
+         if output in output_format:
+             return output_format[output]()
+
+         raise ValueError(f"Unsupported output format: {output}")
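To make the new client concrete, here is a brief usage sketch. It is editorial, not part of the diff: the import path is inferred from the file list above (datamarket/utils/requests.py), the example.com URLs are placeholders, and only signatures visible in the code are used (RequestsClient with its optional proxy_interface, get_data's output and ignored_status_codes parameters, and the static RnetRequestAdapter.requests_to_rnet_kwargs helper).

    from datamarket.utils.requests import RequestsClient, RnetRequestAdapter

    # No ProxyInterface: proxy_interface defaults to None, so requests go out directly.
    client = RequestsClient()

    # JSON endpoint; retries, ban_sleep pacing, and HTTP error mapping happen inside get_data.
    data = client.get_data("https://example.com/api/items", output="json")

    # HTML page parsed into BeautifulSoup via output="soup"; a 403 here would raise
    # IgnoredHTTPError instead of being retried.
    soup = client.get_data("https://example.com/catalog", output="soup", ignored_status_codes=(403,))

    # The adapter translates requests-style kwargs into rnet kwargs: 'params'
    # becomes 'query' and the cookies dict is folded into a Cookie header.
    rnet_kwargs = RnetRequestAdapter.requests_to_rnet_kwargs(
        headers={"Accept": "text/html"},
        params={"page": "2"},
        cookies={"session": "abc123"},
    )
    print(rnet_kwargs["query"])              # {'page': '2'}
    print(rnet_kwargs["headers"]["Cookie"])  # session=abc123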
datamarket/utils/selenium.py
@@ -17,9 +17,7 @@ logger = logging.getLogger(__name__)
  
  
  def get_chromedriver_version():
-     return int(
-         run_bash_command("/usr/bin/google-chrome --version").split(" ")[2].split(".")[0]
-     )
+     return int(run_bash_command("/usr/bin/google-chrome --version").split(" ")[2].split(".")[0])
  
  
  def get_driver(chrome_options=None, **kwargs):
@@ -38,23 +36,19 @@ def get_driver(chrome_options=None, **kwargs):
  
  def wait(driver, css_selector, timeout=30):
      logger.info(f"waiting for {css_selector}...")
-     return WebDriverWait(driver, timeout).until(
-         EC.visibility_of_element_located(("css selector", css_selector))
-     )
+     return WebDriverWait(driver, timeout).until(EC.visibility_of_element_located(("css selector", css_selector)))
  
  
  def wait_and_click(driver, css_selector, timeout=30):
      logger.info(f"clicking on {css_selector}...")
-     WebDriverWait(driver, timeout).until(
-         EC.element_to_be_clickable(("css selector", css_selector))
-     ).click()
+     WebDriverWait(driver, timeout).until(EC.element_to_be_clickable(("css selector", css_selector))).click()
  
  
  def wait_and_fill(driver, css_selector, text_to_fill, timeout=30):
      logger.info(f"sending text to {css_selector}...")
-     WebDriverWait(driver, timeout).until(
-         EC.presence_of_element_located(("css selector", css_selector))
-     ).send_keys(text_to_fill)
+     WebDriverWait(driver, timeout).until(EC.presence_of_element_located(("css selector", css_selector))).send_keys(
+         text_to_fill
+     )
  
  
  def scroll(driver, css_selector):
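As with the client above, a short editorial sketch of the reflowed selenium helpers (not part of the diff; the import path follows the file list, and the URL and CSS selectors are placeholders):

    from datamarket.utils.selenium import get_driver, wait, wait_and_click, wait_and_fill

    driver = get_driver()  # chrome_options and extra kwargs are optional per the signature above
    try:
        driver.get("https://example.com/login")
        wait(driver, "form#login", timeout=15)               # blocks until the form is visible
        wait_and_fill(driver, "input[name='user']", "demo")  # waits for presence, then types
        wait_and_click(driver, "button[type='submit']")      # waits for clickability, then clicks
    finally:
        driver.quit()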
datamarket/utils/strings/__init__.py
@@ -0,0 +1 @@
+ from .normalization import * # noqa: F403