datamarket 0.7.102__py3-none-any.whl → 0.7.103__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datamarket might be problematic. Click here for more details.

@@ -19,6 +19,7 @@ class ProxyInterface:
19
19
  def __init__(self, config):
20
20
  self._load_from_config(config)
21
21
  self.current_index = random.randrange(len(self.entries)) if self.entries else 0
22
+ self._health = {} # {entry: {"ok": bool, "last_checked": time.time(), "last_error": str}}
22
23
 
23
24
  def _load_from_config(self, cfg):
24
25
  # Tor password (optional)
@@ -55,23 +56,42 @@ class ProxyInterface:
55
56
  auth = f"{user}:{password}@" if user and password else ""
56
57
  return f"{schema}://{auth}{host}:{port}"
57
58
 
58
- def get_proxies(self, use_tor=False, randomize=False, raw=False, use_auth=False, use_socks=False):
59
+ def get_proxies(
60
+ self,
61
+ use_tor=False,
62
+ randomize=False,
63
+ raw=False,
64
+ use_auth=False,
65
+ use_socks=False,
66
+ health_check=True,
67
+ check_timeout=5,
68
+ cooldown_seconds=600,
69
+ ):
59
70
  """
60
- Return parsed proxy URLs or raw entry tuple.
71
+ Return parsed proxy URLs or raw entry tuple for a working proxy.
61
72
 
62
73
  :param use_tor: route via local Tor SOCKS5 if True
63
74
  :param randomize: select a random proxy if True, otherwise round-robin
64
75
  :param raw: return raw (host, port, user, password) tuple if True
65
76
  :param use_auth: include proxies that require authentication if True; otherwise only credential-free
77
+ :param health_check: perform health checks to ensure proxy is working if True
78
+ :param check_timeout: timeout in seconds for health check requests
79
+ :param cooldown_seconds: how long to cache health status before re-checking
66
80
  """
67
- # Tor handling
81
+ # Tor handling (skip health check for tor)
68
82
  if use_tor:
69
83
  if raw:
70
84
  return ("127.0.0.1", "9050", None, None)
71
85
  return {"socks5": self.get_proxy_url("127.0.0.1", 9050, schema="socks5")}
72
86
 
73
- # Select entry based on strategy and auth preference
74
- host, port, user, password = self.get_random(use_auth) if randomize else self.get_next(use_auth)
87
+ # Get a working entry (with health checks if enabled)
88
+ if health_check:
89
+ host, port, user, password = self._get_working_entry(
90
+ use_auth=use_auth, randomize=randomize, check_timeout=check_timeout, cooldown_seconds=cooldown_seconds
91
+ )
92
+ else:
93
+ # Legacy behavior: no health check
94
+ host, port, user, password = self.get_random(use_auth) if randomize else self.get_next(use_auth)
75
95
 
76
96
  if raw:
77
97
  return host, port, user, password
@@ -149,3 +169,87 @@ class ProxyInterface:
149
169
  except Exception as ex:
150
170
  logger.error("Failed to renew Tor IP")
151
171
  logger.error(ex)
172
+
173
+ def mark_entry_status(self, entry, ok, error=None):
174
+ """Update health cache for an entry."""
175
+ self._health[entry] = {
176
+ "ok": ok,
177
+ "last_checked": time.time(),
178
+ "last_error": error,
179
+ }
180
+
181
+ def is_entry_alive(self, entry, timeout=5):
182
+ """Check if a proxy entry is working by making a test request."""
183
+ host, port, user, pwd = entry
184
+ try:
185
+ proxies = {
186
+ "http": self.get_proxy_url(host, port, user, pwd, "http"),
187
+ "https": self.get_proxy_url(host, port, user, pwd, "http"),
188
+ }
189
+ resp = requests.get(self.CHECK_IP_URL, proxies=proxies, timeout=timeout)
190
+ ok = resp.status_code == 200
191
+ self.mark_entry_status(entry, ok)
192
+ return ok
193
+ except Exception as ex:
194
+ self.mark_entry_status(entry, False, str(ex))
195
+ return False
196
+
197
+ def _get_working_entry(self, use_auth=False, randomize=False, check_timeout=5, cooldown_seconds=60):
198
+ """Get a working proxy entry, performing health checks as needed."""
199
+ if not self.entries:
200
+ raise RuntimeError("No proxies available")
201
+
202
+ # Build candidate list respecting use_auth and randomize/round-robin
203
+ pool = self.entries if use_auth else [e for e in self.entries if not e[2] and not e[3]]
204
+ if not pool:
205
+ pool = self.entries
206
+
207
+ candidates = pool[:] if randomize else self._get_round_robin_candidates(pool)
208
+
209
+ # First pass: check cache and health
210
+ for entry in candidates:
211
+ health = self._health.get(entry, {})
212
+ last_checked = health.get("last_checked", 0)
213
+ ok = health.get("ok", False)
214
+ now = time.time()
215
+
216
+ if ok and (now - last_checked) < cooldown_seconds:
217
+ # Cached as working and recent
218
+ logger.debug(f"Using cached working proxy: {entry[0]}:{entry[1]}")
219
+ return entry
220
+ elif not ok and (now - last_checked) < cooldown_seconds:
221
+ # Cached as failed and recent, skip
222
+ continue
223
+ else:
224
+ # Not cached or expired, check now
225
+ logger.debug(f"Checking proxy health: {entry[0]}:{entry[1]}")
226
+ if self.is_entry_alive(entry, timeout=check_timeout):
227
+ return entry
228
+
229
+ # Second pass: force fresh check for all candidates (in case cache skipped everything)
230
+ logger.warning("No cached working proxies, forcing fresh checks")
231
+ for entry in candidates:
232
+ logger.debug(f"Force-checking proxy: {entry[0]}:{entry[1]}")
233
+ if self.is_entry_alive(entry, timeout=check_timeout):
234
+ return entry
235
+
236
+ # No working proxies found
237
+ raise RuntimeError("No working proxies available")
238
+
239
+ def _get_round_robin_candidates(self, pool):
240
+ """Get candidates in round-robin order starting from current_index."""
241
+ candidates = []
242
+ start_idx = self.current_index
243
+ for i in range(len(self.entries)):
244
+ idx = (start_idx + i) % len(self.entries)
245
+ entry = self.entries[idx]
246
+ if entry in pool:
247
+ candidates.append(entry)
248
+ # Update current_index for next call
249
+ if candidates:
250
+ try:
251
+ pos = self.entries.index(candidates[0])
252
+ self.current_index = (pos + 1) % len(self.entries)
253
+ except ValueError:
254
+ pass
255
+ return candidates
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: datamarket
3
- Version: 0.7.102
3
+ Version: 0.7.103
4
4
  Summary: Utilities that integrate advanced scraping knowledge into just one library.
5
5
  License: GPL-3.0-or-later
6
6
  Author: DataMarket
@@ -9,7 +9,7 @@ datamarket/interfaces/drive.py,sha256=3nhx3THr2SHNWKYwme9F2nPpvsqyEMFIxz0whF2FjH
9
9
  datamarket/interfaces/ftp.py,sha256=K219-PP21EhQo1A1LkvRLahlrw2-pf4svBN0LogZaJE,2813
10
10
  datamarket/interfaces/nominatim.py,sha256=TjS9O2U446XuPUzfP65NwDSG-RDNqmYb6-NKikM-34w,15187
11
11
  datamarket/interfaces/peerdb.py,sha256=sO451wEGNb_0DDwchZ6eBVYKltqHM5XKau-WsfspXzA,23640
12
- datamarket/interfaces/proxy.py,sha256=Uu-dHvpQOLNBZPGHAanLXnKT1789ArcHfOw8exECt34,5398
12
+ datamarket/interfaces/proxy.py,sha256=YNPNDFd2xTF-P5MITRHxGCLiXD8Fal4HK0yN8KhuYgI,9738
13
13
  datamarket/interfaces/tinybird.py,sha256=AYrcRGNOCoCt7ojilkWa27POROee9sTCwZ61GGHEPeM,2698
14
14
  datamarket/params/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
15
  datamarket/params/nominatim.py,sha256=RnmYXGoJQCijOsuCavCYcxw98WvOd_vOMK4KaraI0RU,11967
@@ -29,7 +29,7 @@ datamarket/utils/strings/obfuscation.py,sha256=Jo-x3f2Cb75983smmpcdPqUlBrLCTyrnm
29
29
  datamarket/utils/strings/standardization.py,sha256=c8CAG6HI3AfK0hB3A3IGwsbnQebZ6R3PrA5PELHRXM0,1492
30
30
  datamarket/utils/typer.py,sha256=FDF3l6gh3UlAFPsHCtesnekvct2rKz0oFn3uKARBQvE,814
31
31
  datamarket/utils/types.py,sha256=vxdQZdwdXrfPR4Es52gBgol-tMRIOD6oK9cBo3rB0JQ,74
32
- datamarket-0.7.102.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
33
- datamarket-0.7.102.dist-info/METADATA,sha256=FQcOMGhkANO_QeIbb9ISfm_MwcRCEo3TqCeVV5PONnI,7382
34
- datamarket-0.7.102.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
35
- datamarket-0.7.102.dist-info/RECORD,,
32
+ datamarket-0.7.103.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
33
+ datamarket-0.7.103.dist-info/METADATA,sha256=XAS_V3qRLGddVGC6sRhxavgwa4EdzONa1B-YWnvcMK0,7382
34
+ datamarket-0.7.103.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
35
+ datamarket-0.7.103.dist-info/RECORD,,