datamarket 0.7.101__tar.gz → 0.7.103__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of datamarket might be problematic.
Files changed (35)
  1. {datamarket-0.7.101 → datamarket-0.7.103}/PKG-INFO +1 -1
  2. {datamarket-0.7.101 → datamarket-0.7.103}/pyproject.toml +1 -1
  3. {datamarket-0.7.101 → datamarket-0.7.103}/src/datamarket/exceptions/main.py +13 -0
  4. {datamarket-0.7.101 → datamarket-0.7.103}/src/datamarket/interfaces/proxy.py +109 -5
  5. {datamarket-0.7.101 → datamarket-0.7.103}/src/datamarket/utils/main.py +39 -1
  6. {datamarket-0.7.101 → datamarket-0.7.103}/LICENSE +0 -0
  7. {datamarket-0.7.101 → datamarket-0.7.103}/README.md +0 -0
  8. {datamarket-0.7.101 → datamarket-0.7.103}/src/datamarket/__init__.py +0 -0
  9. {datamarket-0.7.101 → datamarket-0.7.103}/src/datamarket/exceptions/__init__.py +0 -0
  10. {datamarket-0.7.101 → datamarket-0.7.103}/src/datamarket/interfaces/__init__.py +0 -0
  11. {datamarket-0.7.101 → datamarket-0.7.103}/src/datamarket/interfaces/alchemy.py +0 -0
  12. {datamarket-0.7.101 → datamarket-0.7.103}/src/datamarket/interfaces/aws.py +0 -0
  13. {datamarket-0.7.101 → datamarket-0.7.103}/src/datamarket/interfaces/azure.py +0 -0
  14. {datamarket-0.7.101 → datamarket-0.7.103}/src/datamarket/interfaces/drive.py +0 -0
  15. {datamarket-0.7.101 → datamarket-0.7.103}/src/datamarket/interfaces/ftp.py +0 -0
  16. {datamarket-0.7.101 → datamarket-0.7.103}/src/datamarket/interfaces/nominatim.py +0 -0
  17. {datamarket-0.7.101 → datamarket-0.7.103}/src/datamarket/interfaces/peerdb.py +0 -0
  18. {datamarket-0.7.101 → datamarket-0.7.103}/src/datamarket/interfaces/tinybird.py +0 -0
  19. {datamarket-0.7.101 → datamarket-0.7.103}/src/datamarket/params/__init__.py +0 -0
  20. {datamarket-0.7.101 → datamarket-0.7.103}/src/datamarket/params/nominatim.py +0 -0
  21. {datamarket-0.7.101 → datamarket-0.7.103}/src/datamarket/utils/__init__.py +0 -0
  22. {datamarket-0.7.101 → datamarket-0.7.103}/src/datamarket/utils/airflow.py +0 -0
  23. {datamarket-0.7.101 → datamarket-0.7.103}/src/datamarket/utils/alchemy.py +0 -0
  24. {datamarket-0.7.101 → datamarket-0.7.103}/src/datamarket/utils/nominatim.py +0 -0
  25. {datamarket-0.7.101 → datamarket-0.7.103}/src/datamarket/utils/playwright/__init__.py +0 -0
  26. {datamarket-0.7.101 → datamarket-0.7.103}/src/datamarket/utils/playwright/async_api.py +0 -0
  27. {datamarket-0.7.101 → datamarket-0.7.103}/src/datamarket/utils/playwright/sync_api.py +0 -0
  28. {datamarket-0.7.101 → datamarket-0.7.103}/src/datamarket/utils/selenium.py +0 -0
  29. {datamarket-0.7.101 → datamarket-0.7.103}/src/datamarket/utils/soda.py +0 -0
  30. {datamarket-0.7.101 → datamarket-0.7.103}/src/datamarket/utils/strings/__init__.py +0 -0
  31. {datamarket-0.7.101 → datamarket-0.7.103}/src/datamarket/utils/strings/normalization.py +0 -0
  32. {datamarket-0.7.101 → datamarket-0.7.103}/src/datamarket/utils/strings/obfuscation.py +0 -0
  33. {datamarket-0.7.101 → datamarket-0.7.103}/src/datamarket/utils/strings/standardization.py +0 -0
  34. {datamarket-0.7.101 → datamarket-0.7.103}/src/datamarket/utils/typer.py +0 -0
  35. {datamarket-0.7.101 → datamarket-0.7.103}/src/datamarket/utils/types.py +0 -0
--- datamarket-0.7.101/PKG-INFO
+++ datamarket-0.7.103/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: datamarket
-Version: 0.7.101
+Version: 0.7.103
 Summary: Utilities that integrate advanced scraping knowledge into just one library.
 License: GPL-3.0-or-later
 Author: DataMarket
--- datamarket-0.7.101/pyproject.toml
+++ datamarket-0.7.103/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "datamarket"
-version = "0.7.101"
+version = "0.7.103"
 description = "Utilities that integrate advanced scraping knowledge into just one library."
 authors = ["DataMarket <techsupport@datamarket.es>"]
 license = "GPL-3.0-or-later"
--- datamarket-0.7.101/src/datamarket/exceptions/main.py
+++ datamarket-0.7.103/src/datamarket/exceptions/main.py
@@ -2,6 +2,9 @@
 # CLASSES


+import requests
+
+
 class RedirectionDetectedError(Exception):
     def __init__(self, message="Redirection detected!"):
         self.message = message
@@ -18,3 +21,13 @@ class BadRequestError(Exception):
     def __init__(self, message="Bad request!"):
         self.message = message
         super().__init__(self.message)
+
+class ManagedHTTPError(Exception):
+    """Signal that this HTTP status was handled and should not be retried."""
+    def __init__(self, response: requests.Response, *, url: str | None = None, message: str | None = None):
+        self.response = response
+        self.request = getattr(response, "request", None)
+        self.status_code = getattr(response, "status_code", None)
+        self.url = url or (self.request.url if self.request is not None else None)
+        self.message = message
+        super().__init__(message or f"HTTP {self.status_code} for {self.url}")
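For context, ManagedHTTPError carries the original requests.Response so callers can inspect the status and URL after the fact. A minimal usage sketch, assuming the caller decides which statuses count as "handled" (the 403 status and the fetch_or_skip helper are hypothetical, not part of the package):

    import requests

    from datamarket.exceptions.main import ManagedHTTPError

    def fetch_or_skip(url):
        # Hypothetical caller: a "handled" status becomes a non-retryable signal,
        # while the response stays attached to the exception for logging.
        resp = requests.get(url, timeout=10)
        if resp.status_code == 403:  # assumed "handled" status for this sketch
            raise ManagedHTTPError(resp, url=url)
        return resp

    try:
        r = fetch_or_skip("https://example.com/item/1")
    except ManagedHTTPError as ex:
        print(ex.status_code, ex.url)  # e.g.: 403 https://example.com/item/1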
--- datamarket-0.7.101/src/datamarket/interfaces/proxy.py
+++ datamarket-0.7.103/src/datamarket/interfaces/proxy.py
@@ -19,6 +19,7 @@ class ProxyInterface:
     def __init__(self, config):
         self._load_from_config(config)
         self.current_index = random.randrange(len(self.entries)) if self.entries else 0
+        self._health = {}  # {entry: {"ok": bool, "last_checked": time.time(), "last_error": str}}

     def _load_from_config(self, cfg):
         # Tor password (optional)
@@ -55,23 +56,42 @@ class ProxyInterface:
         auth = f"{user}:{password}@" if user and password else ""
         return f"{schema}://{auth}{host}:{port}"

-    def get_proxies(self, use_tor=False, randomize=False, raw=False, use_auth=False, use_socks=False):
+    def get_proxies(
+        self,
+        use_tor=False,
+        randomize=False,
+        raw=False,
+        use_auth=False,
+        use_socks=False,
+        health_check=True,
+        check_timeout=5,
+        cooldown_seconds=600,
+    ):
         """
-        Return parsed proxy URLs or raw entry tuple.
+        Return parsed proxy URLs or raw entry tuple for a working proxy.

         :param use_tor: route via local Tor SOCKS5 if True
         :param randomize: select a random proxy if True, otherwise round-robin
         :param raw: return raw (host, port, user, password) tuple if True
         :param use_auth: include proxies that require authentication if True; otherwise only credential-free
+        :param health_check: perform health checks to ensure proxy is working if True
+        :param check_timeout: timeout in seconds for health check requests
+        :param cooldown_seconds: how long to cache health status before re-checking
         """
-        # Tor handling
+        # Tor handling (skip health check for tor)
        if use_tor:
             if raw:
                 return ("127.0.0.1", "9050", None, None)
             return {"socks5": self.get_proxy_url("127.0.0.1", 9050, schema="socks5")}

-        # Select entry based on strategy and auth preference
-        host, port, user, password = self.get_random(use_auth) if randomize else self.get_next(use_auth)
+        # Get a working entry (with health checks if enabled)
+        if health_check:
+            host, port, user, password = self._get_working_entry(
+                use_auth=use_auth, randomize=randomize, check_timeout=check_timeout, cooldown_seconds=cooldown_seconds
+            )
+        else:
+            # Legacy behavior: no health check
+            host, port, user, password = self.get_random(use_auth) if randomize else self.get_next(use_auth)

         if raw:
             return host, port, user, password
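A usage sketch of the new keyword arguments, assuming a ProxyInterface already initialized from a config (the config shape and the exact non-raw return mapping are not shown in this diff):

    # Hypothetical caller; health_check=True is the new default in 0.7.103.
    proxies = proxy_interface.get_proxies(
        randomize=True,        # pick a random healthy proxy instead of round-robin
        health_check=True,     # probe liveness before returning
        check_timeout=5,       # seconds per liveness probe
        cooldown_seconds=600,  # trust a cached health verdict for 10 minutes
    )

    # Legacy behavior, no probing; returns the raw (host, port, user, password) tuple.
    host, port, user, password = proxy_interface.get_proxies(raw=True, health_check=False)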
@@ -149,3 +169,87 @@ class ProxyInterface:
         except Exception as ex:
             logger.error("Failed to renew Tor IP")
             logger.error(ex)
+
+    def mark_entry_status(self, entry, ok, error=None):
+        """Update health cache for an entry."""
+        self._health[entry] = {
+            "ok": ok,
+            "last_checked": time.time(),
+            "last_error": error,
+        }
+
+    def is_entry_alive(self, entry, timeout=5):
+        """Check if a proxy entry is working by making a test request."""
+        host, port, user, pwd = entry
+        try:
+            proxies = {
+                "http": self.get_proxy_url(host, port, user, pwd, "http"),
+                "https": self.get_proxy_url(host, port, user, pwd, "http"),
+            }
+            resp = requests.get(self.CHECK_IP_URL, proxies=proxies, timeout=timeout)
+            ok = resp.status_code == 200
+            self.mark_entry_status(entry, ok)
+            return ok
+        except Exception as ex:
+            self.mark_entry_status(entry, False, str(ex))
+            return False
+
+    def _get_working_entry(self, use_auth=False, randomize=False, check_timeout=5, cooldown_seconds=60):
+        """Get a working proxy entry, performing health checks as needed."""
+        if not self.entries:
+            raise RuntimeError("No proxies available")
+
+        # Build candidate list respecting use_auth and randomize/round-robin
+        pool = self.entries if use_auth else [e for e in self.entries if not e[2] and not e[3]]
+        if not pool:
+            pool = self.entries
+
+        candidates = pool[:] if randomize else self._get_round_robin_candidates(pool)
+
+        # First pass: check cache and health
+        for entry in candidates:
+            health = self._health.get(entry, {})
+            last_checked = health.get("last_checked", 0)
+            ok = health.get("ok", False)
+            now = time.time()
+
+            if ok and (now - last_checked) < cooldown_seconds:
+                # Cached as working and recent
+                logger.debug(f"Using cached working proxy: {entry[0]}:{entry[1]}")
+                return entry
+            elif not ok and (now - last_checked) < cooldown_seconds:
+                # Cached as failed and recent, skip
+                continue
+            else:
+                # Not cached or expired, check now
+                logger.debug(f"Checking proxy health: {entry[0]}:{entry[1]}")
+                if self.is_entry_alive(entry, timeout=check_timeout):
+                    return entry
+
+        # Second pass: force fresh check for all candidates (in case cache skipped everything)
+        logger.warning("No cached working proxies, forcing fresh checks")
+        for entry in candidates:
+            logger.debug(f"Force-checking proxy: {entry[0]}:{entry[1]}")
+            if self.is_entry_alive(entry, timeout=check_timeout):
+                return entry
+
+        # No working proxies found
+        raise RuntimeError("No working proxies available")
+
+    def _get_round_robin_candidates(self, pool):
+        """Get candidates in round-robin order starting from current_index."""
+        candidates = []
+        start_idx = self.current_index
+        for i in range(len(self.entries)):
+            idx = (start_idx + i) % len(self.entries)
+            entry = self.entries[idx]
+            if entry in pool:
+                candidates.append(entry)
+        # Update current_index for next call
+        if candidates:
+            try:
+                pos = self.entries.index(candidates[0])
+                self.current_index = (pos + 1) % len(self.entries)
+            except ValueError:
+                pass
+        return candidates
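To make the cooldown logic concrete, here is a minimal standalone sketch of the same cache-then-probe pattern (names are illustrative; probe stands in for is_entry_alive):

    import time

    HEALTH = {}  # entry -> {"ok": bool, "last_checked": float}

    def pick_entry(candidates, probe, cooldown_seconds=600):
        """Return the first entry that is cached-healthy or probes healthy."""
        for entry in candidates:
            now = time.time()
            cached = HEALTH.get(entry)
            if cached and now - cached["last_checked"] < cooldown_seconds:
                if cached["ok"]:
                    return entry  # fresh positive verdict: reuse without probing
                continue          # fresh negative verdict: skip without probing
            ok = probe(entry)     # stale or missing verdict: probe now
            HEALTH[entry] = {"ok": ok, "last_checked": now}
            if ok:
                return entry
        raise RuntimeError("No working proxies available")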
--- datamarket-0.7.101/src/datamarket/utils/main.py
+++ datamarket-0.7.103/src/datamarket/utils/main.py
@@ -10,6 +10,7 @@ import re
 import shlex
 import subprocess
 import time
+from typing import Sequence
 from babel.numbers import parse_decimal

 from bs4 import BeautifulSoup
@@ -26,6 +27,8 @@ from tenacity import (
     wait_exponential,
 )

+from datamarket.exceptions.main import ManagedHTTPError
+
 from ..exceptions import RedirectionDetectedError, NotFoundError, BadRequestError
 from ..interfaces.proxy import ProxyInterface

@@ -132,7 +135,9 @@ def parse_field(dict_struct, field_path, format_method=None):


 @retry(
-    retry=retry_if_not_exception_type((NotFoundError, BadRequestError, RedirectionDetectedError, ProxyError)),
+    retry=retry_if_not_exception_type(
+        (NotFoundError, BadRequestError, RedirectionDetectedError, ProxyError, ManagedHTTPError)
+    ),
     wait=wait_exponential(exp_base=3, multiplier=3, max=60),
     stop=stop_after_attempt(5),
     before_sleep=before_sleep_log(logger, logging.WARNING),
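The practical effect of the new exclusion is that tenacity re-raises ManagedHTTPError immediately instead of backing off and retrying. A minimal self-contained illustration (the stub class stands in for the real one):

    from tenacity import retry, retry_if_not_exception_type, stop_after_attempt

    class ManagedHTTPError(Exception):  # stand-in for datamarket's exception
        pass

    calls = {"n": 0}

    @retry(retry=retry_if_not_exception_type(ManagedHTTPError), stop=stop_after_attempt(5))
    def handled():
        calls["n"] += 1
        raise ManagedHTTPError("already handled, do not retry")

    try:
        handled()
    except ManagedHTTPError:
        print(calls["n"])  # 1 -- raised through on the first attempt, no retries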
@@ -146,8 +151,37 @@ def get_data(
     proxy_interface: ProxyInterface = None,
     use_auth_proxies: bool = False,
     max_proxy_delay: timedelta = timedelta(minutes=10),
+    ignored_status_codes: Sequence[int] = (),
     **kwargs,
 ):
+    """
+    Fetches data from a given URL using HTTP requests, with support for proxy configuration, retries, and flexible output formats.
+
+    Args:
+        url (str): The target URL to fetch data from.
+        method (str, optional): HTTP method to use (e.g., 'GET', 'POST'). Defaults to 'GET'.
+        output (str, optional): Output format ('json', 'text', 'soup', 'response'). Defaults to 'json'.
+        sleep (tuple, optional): Tuple specifying max and min sleep times (seconds) after request. Defaults to (6, 3).
+        proxy_interface (ProxyInterface, optional): Proxy provider. If None, no proxy is used. Defaults to None.
+        use_auth_proxies (bool, optional): Whether to use authenticated proxies. Defaults to False.
+        max_proxy_delay (timedelta, optional): Maximum delay for proxy retry logic. Defaults to 10 minutes.
+        ignored_status_codes (Sequence[int], optional): Status codes to ignore and return response for. Defaults to ().
+        **kwargs: Additional arguments passed to the requests method.
+
+    Returns:
+        Depends on the 'output' argument:
+        - 'json': Parsed JSON response.
+        - 'text': Response text.
+        - 'soup': BeautifulSoup-parsed HTML.
+        - 'response': Raw requests.Response object.
+
+    Raises:
+        ManagedHTTPError: If a response status code is in `ignored_status_codes`.
+        NotFoundError: If a 404 status code is returned.
+        BadRequestError: If a 400 status code is returned.
+        RedirectionDetectedError, ProxyError: On specific error conditions.
+        requests.HTTPError: For other HTTP errors if not ignored.
+    """
     retry_type = retry_if_exception_type(ProxyError)
     wait = wait_exponential(exp_base=3, multiplier=3, max=60)
     stop = stop_after_delay(max_proxy_delay)
@@ -174,11 +208,15 @@ def get_data(

     ban_sleep(*sleep)

+    if r.status_code in ignored_status_codes:
+        raise ManagedHTTPError(r, url=url, message=f"Status {r.status_code} in ignored_status_codes for URL {url}")
     if r.status_code == 404:
         raise NotFoundError(f"404 Not Found error for {url}")
     if r.status_code == 400:
         raise BadRequestError(f"400 Bad Request error for {url}")
+
     r.raise_for_status()
+
     r.encoding = "utf-8"

     if output == "json":
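A usage sketch of the new parameter (URL and status list are illustrative; the import path is assumed from the file layout above). Note that an ignored status surfaces as ManagedHTTPError rather than a returned response:

    from datamarket.exceptions.main import ManagedHTTPError
    from datamarket.utils.main import get_data  # assumed import path

    try:
        data = get_data(
            "https://example.com/api/items",   # illustrative URL
            output="json",
            ignored_status_codes=(403, 429),   # handled statuses: no tenacity retries
        )
    except ManagedHTTPError as ex:
        # The original response stays attached for inspection/logging.
        print(f"Skipped {ex.url}: HTTP {ex.status_code}")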