datamarket 0.7.105__py3-none-any.whl → 0.7.106__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datamarket might be problematic. Click here for more details.

@@ -22,7 +22,7 @@ class ProxyInterface:
22
22
 
23
23
  def __init__(self, config):
24
24
  self._load_from_config(config)
25
- self.current_index = random.randrange(len(self.entries)) if self.entries else 0 # noqa: S311
25
+ self.current_index = -2 # -2 means no selection made yet, -1 means Tor selected
26
26
  self._health = {} # {entry: {"ok": bool, "last_checked": time.time(), "last_error": str}}
27
27
 
28
28
  def _load_from_config(self, cfg):
@@ -92,6 +92,7 @@ class ProxyInterface:
92
92
  """
93
93
  # Tor handling (skip health check for tor)
94
94
  if use_tor:
95
+ self.current_index = -1 # Indicate Tor is selected
95
96
  if raw:
96
97
  return ("127.0.0.1", "9050", None, None)
97
98
  return {"socks5": self.get_proxy_url("127.0.0.1", 9050, schema="socks5")}
@@ -135,15 +136,19 @@ class ProxyInterface:
135
136
  if not pool:
136
137
  pool = self.entries
137
138
 
138
- # Find next in pool using current_index
139
- for _ in range(len(self.entries)):
140
- idx = self.current_index
141
- self.current_index = (self.current_index + 1) % len(self.entries)
139
+ # Start from the next index after current_index
140
+ start_idx = (self.current_index + 1) % len(self.entries) if self.current_index >= 0 else 0
141
+
142
+ # Find next in pool starting from start_idx
143
+ for i in range(len(self.entries)):
144
+ idx = (start_idx + i) % len(self.entries)
142
145
  entry = self.entries[idx]
143
146
  if entry in pool:
147
+ self.current_index = idx # Update to selected index
144
148
  return entry
145
149
 
146
150
  # Fallback to first entry
151
+ self.current_index = 0
147
152
  return self.entries[0]
148
153
 
149
154
  def get_random(self, use_auth=False):
@@ -156,10 +161,10 @@ class ProxyInterface:
156
161
  pool = self.entries
157
162
 
158
163
  entry = random.choice(pool) # noqa: S311
159
- # Update index to after selected entry for round-robin continuity
164
+ # Update index to selected entry
160
165
  try:
161
166
  pos = self.entries.index(entry)
162
- self.current_index = (pos + 1) % len(self.entries)
167
+ self.current_index = pos
163
168
  except ValueError:
164
169
  pass
165
170
 
@@ -190,6 +195,52 @@ class ProxyInterface:
190
195
  logger.error("Failed to renew Tor IP")
191
196
  logger.error(ex)
192
197
 
198
+ def wait_for_new_ip(self, timeout=600, interval=5, check_timeout=5):
199
+ """
200
+ Wait for the IP address of the currently selected proxy to change.
201
+
202
+ :param timeout: Max seconds to wait for IP change
203
+ :param interval: Seconds between IP checks
204
+ :param check_timeout: Timeout for individual IP check requests
205
+ :return: The selected entry (unchanged)
206
+ :raises RuntimeError: If no proxy is available or baseline cannot be determined
207
+ :raises EnsureNewIPTimeoutError: If IP doesn't change within timeout
208
+ """
209
+ # Use currently selected proxy
210
+ if self.current_index == -1:
211
+ # Tor is selected
212
+ entry = ("127.0.0.1", "9050", None, None)
213
+ elif self.current_index >= 0 and self.current_index < len(self.entries):
214
+ # current_index points to the selected entry
215
+ entry = self.entries[self.current_index]
216
+ else:
217
+ # No valid selection, select one
218
+ logger.debug("No proxy currently selected, selecting one for IP waiting")
219
+ self.get_proxies(raw=True)
220
+ if self.current_index == -1:
221
+ entry = ("127.0.0.1", "9050", None, None)
222
+ elif self.current_index >= 0 and self.current_index < len(self.entries):
223
+ entry = self.entries[self.current_index]
224
+ else:
225
+ raise RuntimeError("Could not select a proxy for IP waiting")
226
+
227
+ # Auto-detect baseline IP
228
+ host, port, user, pwd = entry
229
+ proxies_map = {
230
+ "http": self.get_proxy_url(host, port, user, pwd, "http"),
231
+ "https": self.get_proxy_url(host, port, user, pwd, "http"),
232
+ }
233
+ try:
234
+ resp = requests.get(self.CHECK_IP_URL, proxies=proxies_map, timeout=check_timeout)
235
+ baseline = resp.json().get("YourFuckingIPAddress")
236
+ if not baseline:
237
+ raise RuntimeError(f"Could not determine baseline IP for entry {host}:{port}")
238
+ logger.debug(f"Auto-detected baseline IP: {baseline}")
239
+ except Exception as ex:
240
+ raise RuntimeError(f"Could not determine baseline IP for entry {host}:{port}: {ex}") from ex
241
+
242
+ return self._wait_for_new_ip(entry, baseline, timeout, interval, check_timeout)
243
+
193
244
  def mark_entry_status(self, entry, ok, error=None, last_ip=None):
194
245
  """Update health cache for an entry."""
195
246
  self._health[entry] = {
@@ -234,7 +285,9 @@ class ProxyInterface:
234
285
  # Capture baseline before any health checks that might set it
235
286
  baseline_before = None
236
287
  if len(candidates) == 1:
237
- baseline_before = self._health.get(candidates[0], {}).get("last_ip")
288
+ idx = candidates[0]
289
+ entry = self.entries[idx]
290
+ baseline_before = self._health.get(entry, {}).get("last_ip")
238
291
 
239
292
  def _find_working_entry():
240
293
  if not self.entries:
@@ -251,13 +304,13 @@ class ProxyInterface:
251
304
  entry = _find_working_entry()
252
305
 
253
306
  if ensure_new_ip and len(pool) == 1:
254
- logger.debug(f"ensure_new_ip=True and single proxy, handling IP change check: {entry[0]}:{entry[1]}")
255
307
  baseline = self._health.get(entry, {}).get("last_ip")
256
308
  if not baseline_before:
257
309
  # First time seeing this proxy: it was already checked in _find_working_entry, so return immediately
258
310
  logger.debug("No baseline IP found; returning first-working proxy without waiting for IP change")
259
311
  else:
260
312
  # There is a baseline: wait for the IP to change
313
+ logger.info(f"ensure_new_ip=True and single proxy, waiting for IP change: {entry[0]}:{entry[1]}")
261
314
  entry = self._wait_for_new_ip(
262
315
  entry, baseline, ensure_new_ip_timeout, ensure_new_ip_interval, check_timeout
263
316
  )
@@ -267,19 +320,12 @@ class ProxyInterface:
267
320
  def _get_round_robin_candidates(self, pool):
268
321
  """Get candidates in round-robin order starting from current_index."""
269
322
  candidates = []
270
- start_idx = self.current_index
323
+ start_idx = (self.current_index + 1) % len(self.entries) if self.current_index >= 0 else 0
271
324
  for i in range(len(self.entries)):
272
325
  idx = (start_idx + i) % len(self.entries)
273
326
  entry = self.entries[idx]
274
327
  if entry in pool:
275
- candidates.append(entry)
276
- # Update current_index for next call
277
- if candidates:
278
- try:
279
- pos = self.entries.index(candidates[0])
280
- self.current_index = (pos + 1) % len(self.entries)
281
- except ValueError:
282
- pass
328
+ candidates.append(idx)
283
329
  return candidates
284
330
 
285
331
  def _build_pool(self, use_auth):
@@ -290,12 +336,14 @@ class ProxyInterface:
290
336
 
291
337
  def _get_candidates(self, pool, randomize):
292
338
  if randomize:
293
- return pool[:]
339
+ candidates = [idx for idx, entry in enumerate(self.entries) if entry in pool]
340
+ return random.sample(candidates, k=len(candidates))
294
341
  else:
295
342
  return self._get_round_robin_candidates(pool)
296
343
 
297
344
  def _find_working_entry_once(self, candidates, check_timeout, cooldown_seconds):
298
- for entry in candidates:
345
+ for idx in candidates:
346
+ entry = self.entries[idx]
299
347
  health = self._health.get(entry, {})
300
348
  last_checked = health.get("last_checked", 0)
301
349
  ok = health.get("ok", False)
@@ -303,18 +351,22 @@ class ProxyInterface:
303
351
 
304
352
  if ok and (now - last_checked) < cooldown_seconds:
305
353
  logger.debug(f"Using cached working proxy: {entry[0]}:{entry[1]}")
354
+ self.current_index = idx
306
355
  return entry
307
356
  elif not ok and (now - last_checked) < cooldown_seconds:
308
357
  continue
309
358
  else:
310
359
  logger.debug(f"Checking proxy health: {entry[0]}:{entry[1]}")
311
360
  if self.is_entry_alive(entry, timeout=check_timeout):
361
+ self.current_index = idx
312
362
  return entry
313
363
 
314
364
  logger.warning("No cached working proxies, forcing fresh checks")
315
- for entry in candidates:
365
+ for idx in candidates:
366
+ entry = self.entries[idx]
316
367
  logger.debug(f"Force-checking proxy: {entry[0]}:{entry[1]}")
317
368
  if self.is_entry_alive(entry, timeout=check_timeout):
369
+ self.current_index = idx
318
370
  return entry
319
371
 
320
372
  raise NoWorkingProxiesError("No working proxies available")
@@ -335,7 +387,7 @@ class ProxyInterface:
335
387
 
336
388
  if current_ip and current_ip != baseline:
337
389
  self.mark_entry_status(entry, True, last_ip=current_ip)
338
- logger.debug(f"IP changed from {baseline} to {current_ip}")
390
+ logger.info(f"IP changed from {baseline} to {current_ip}")
339
391
  return entry
340
392
 
341
393
  time.sleep(interval)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: datamarket
3
- Version: 0.7.105
3
+ Version: 0.7.106
4
4
  Summary: Utilities that integrate advanced scraping knowledge into just one library.
5
5
  License: GPL-3.0-or-later
6
6
  Author: DataMarket
@@ -9,7 +9,7 @@ datamarket/interfaces/drive.py,sha256=3nhx3THr2SHNWKYwme9F2nPpvsqyEMFIxz0whF2FjH
9
9
  datamarket/interfaces/ftp.py,sha256=LH3Oz19k_xUNhzDXcrq5Ofb4c3uiph5pWUqpgiaDvHI,2671
10
10
  datamarket/interfaces/nominatim.py,sha256=xizT94tVum7QPppfDgI5sEhx1mAXT-SM3JyPl8CDxxU,15148
11
11
  datamarket/interfaces/peerdb.py,sha256=sO451wEGNb_0DDwchZ6eBVYKltqHM5XKau-WsfspXzA,23640
12
- datamarket/interfaces/proxy.py,sha256=j2m7T9mv18XXnkIhApsCDdG2tUAdnmDLWv1nHTNYHNI,13410
12
+ datamarket/interfaces/proxy.py,sha256=e-bbmtjyjkh4ZAuk6o0nm-UcynH4qmDZnzDQzbOasb8,16063
13
13
  datamarket/interfaces/tinybird.py,sha256=cNG-kAPTdQn2inlNX9LPf-VVdtnLud947ApLVO40Now,2594
14
14
  datamarket/params/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
15
  datamarket/params/nominatim.py,sha256=S9TEB4FxmffvFyK9KffWl20TfXzWX69IAdbEehKar1I,11920
@@ -29,7 +29,7 @@ datamarket/utils/strings/obfuscation.py,sha256=Jo-x3f2Cb75983smmpcdPqUlBrLCTyrnm
29
29
  datamarket/utils/strings/standardization.py,sha256=j_NbT-O1XnxDvDhct8panfkrfAC8R5OX6XM5fYBZ4RU,1496
30
30
  datamarket/utils/typer.py,sha256=geWuwMwGQjBQhxo27hX0vEAeRl1j1TS0u2oFVfpAs5I,816
31
31
  datamarket/utils/types.py,sha256=vxdQZdwdXrfPR4Es52gBgol-tMRIOD6oK9cBo3rB0JQ,74
32
- datamarket-0.7.105.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
33
- datamarket-0.7.105.dist-info/METADATA,sha256=2uuHjpJUomXAU9YJYoRtnQPDZxPBSfbWtQ6RWgm4srk,7382
34
- datamarket-0.7.105.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
35
- datamarket-0.7.105.dist-info/RECORD,,
32
+ datamarket-0.7.106.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
33
+ datamarket-0.7.106.dist-info/METADATA,sha256=A76E203rT92P2sSWe_FpjyL4I3vqirVPWHOFAXYlZug,7382
34
+ datamarket-0.7.106.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
35
+ datamarket-0.7.106.dist-info/RECORD,,