datamarket 0.7.106__py3-none-any.whl → 0.7.108__py3-none-any.whl

This diff shows the changes between publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Potentially problematic release.


This version of datamarket has been flagged as potentially problematic; consult the registry's advisory page for more details.

@@ -1,6 +1,10 @@
1
+ ########################################################################################################################
2
+ # IMPORTS
3
+
1
4
  import logging
2
5
  import random
3
6
  import time
7
+ from datetime import timedelta
4
8
 
5
9
  import requests
6
10
  import tenacity
@@ -9,9 +13,18 @@ from stem.control import Controller
9
13
 
10
14
  from datamarket.exceptions import EnsureNewIPTimeoutError, NoWorkingProxiesError
11
15
 
16
+ ########################################################################################################################
17
+ # SETUP
18
+
12
19
  logger = logging.getLogger(__name__)
13
20
  logging.getLogger("stem").setLevel(logging.WARNING)
14
21
 
22
+ PROXY_ROTATION_INTERVAL = timedelta(minutes=10)
23
+ PROXY_ROTATION_TIMEOUT_SECONDS = int(PROXY_ROTATION_INTERVAL.total_seconds())
24
+
25
+ ########################################################################################################################
26
+ # CLASSES
27
+
15
28
 
16
29
  class ProxyInterface:
17
30
  """
@@ -24,6 +37,10 @@ class ProxyInterface:
24
37
  self._load_from_config(config)
25
38
  self.current_index = -2 # -2 means no selection made yet, -1 means Tor selected
26
39
  self._health = {} # {entry: {"ok": bool, "last_checked": time.time(), "last_error": str}}
40
+ self._traversal_queue = [] # Queue of indices left to test in current traversal
41
+ self._traversal_start = None # Timestamp when current traversal started
42
+ self._last_ip_wait = {} # {entry: (timestamp, traversal_cycle)} - last time we attempted to wait for IP change
43
+ self._traversal_cycle = 0 # Counter of full traversals of the queue
27
44
 
28
45
  def _load_from_config(self, cfg):
29
46
  # Tor password (optional)
@@ -67,13 +84,9 @@ class ProxyInterface:
67
84
  raw=False,
68
85
  use_auth=False,
69
86
  use_socks=False,
70
- health_check=True,
71
87
  check_timeout=5,
72
- cooldown_seconds=600,
73
- ensure_new_ip=False,
74
- ensure_new_ip_timeout=600,
75
- ensure_new_ip_interval=5,
76
- max_retry_seconds=600,
88
+ cooldown_seconds=30,
89
+ proxy_rotation_interval=PROXY_ROTATION_INTERVAL,
77
90
  ):
78
91
  """
79
92
  Return parsed proxy URLs or raw entry tuple for a working proxy.
@@ -82,13 +95,9 @@ class ProxyInterface:
82
95
  :param randomize: select a random proxy if True, otherwise round-robin
83
96
  :param raw: return raw (host, port, user, password) tuple if True
84
97
  :param use_auth: include proxies that require authentication if True; otherwise only credential-free
85
- :param health_check: perform health checks to ensure proxy is working if True
86
98
  :param check_timeout: timeout in seconds for health check requests
87
99
  :param cooldown_seconds: how long to cache health status before re-checking
88
- :param ensure_new_ip: if True and only one proxy available, wait until IP changes before returning
89
- :param ensure_new_ip_timeout: max seconds to wait for IP change when ensure_new_ip=True
90
- :param ensure_new_ip_interval: seconds between IP checks when ensure_new_ip=True
91
- :param max_retry_seconds: max seconds to retry finding working proxies (0 to disable)
100
+ :param proxy_rotation_interval: max time to retry finding working proxies (timedelta or seconds, 0 to disable)
92
101
  """
93
102
  # Tor handling (skip health check for tor)
94
103
  if use_tor:
@@ -98,20 +107,13 @@ class ProxyInterface:
98
107
  return {"socks5": self.get_proxy_url("127.0.0.1", 9050, schema="socks5")}
99
108
 
100
109
  # Get a working entry (with health checks if enabled)
101
- if health_check:
102
- host, port, user, password = self._get_working_entry(
103
- use_auth=use_auth,
104
- randomize=randomize,
105
- check_timeout=check_timeout,
106
- cooldown_seconds=cooldown_seconds,
107
- ensure_new_ip=ensure_new_ip,
108
- ensure_new_ip_timeout=ensure_new_ip_timeout,
109
- ensure_new_ip_interval=ensure_new_ip_interval,
110
- max_retry_seconds=max_retry_seconds,
111
- )
112
- else:
113
- # Legacy behavior: no health check
114
- host, port, user, password = self.get_random(use_auth) if randomize else self.get_next(use_auth)
110
+ host, port, user, password = self._get_working_entry(
111
+ use_auth=use_auth,
112
+ randomize=randomize,
113
+ check_timeout=check_timeout,
114
+ cooldown_seconds=cooldown_seconds,
115
+ proxy_rotation_interval=proxy_rotation_interval,
116
+ )
115
117
 
116
118
  if raw:
117
119
  return host, port, user, password
@@ -127,49 +129,6 @@ class ProxyInterface:
127
129
  "https": self.get_proxy_url(host, port, user, password, "http"),
128
130
  }
129
131
 
130
- def get_next(self, use_auth=False):
131
- # Round-robin selection, optionally filtering out authenticated proxies
132
- if not self.entries:
133
- raise NoWorkingProxiesError("No proxies available")
134
-
135
- pool = self.entries if use_auth else [e for e in self.entries if not e[2] and not e[3]]
136
- if not pool:
137
- pool = self.entries
138
-
139
- # Start from the next index after current_index
140
- start_idx = (self.current_index + 1) % len(self.entries) if self.current_index >= 0 else 0
141
-
142
- # Find next in pool starting from start_idx
143
- for i in range(len(self.entries)):
144
- idx = (start_idx + i) % len(self.entries)
145
- entry = self.entries[idx]
146
- if entry in pool:
147
- self.current_index = idx # Update to selected index
148
- return entry
149
-
150
- # Fallback to first entry
151
- self.current_index = 0
152
- return self.entries[0]
153
-
154
- def get_random(self, use_auth=False):
155
- # Random selection, optionally filtering out authenticated proxies
156
- if not self.entries:
157
- raise NoWorkingProxiesError("No proxies available")
158
-
159
- pool = self.entries if use_auth else [e for e in self.entries if not e[2] and not e[3]]
160
- if not pool:
161
- pool = self.entries
162
-
163
- entry = random.choice(pool) # noqa: S311
164
- # Update index to selected entry
165
- try:
166
- pos = self.entries.index(entry)
167
- self.current_index = pos
168
- except ValueError:
169
- pass
170
-
171
- return entry
172
-
173
132
  def check_current_ip(self, proxies=None):
174
133
  try:
175
134
  proxies_arg = proxies or {"http": self.proxies["http"]}
@@ -195,9 +154,9 @@ class ProxyInterface:
195
154
  logger.error("Failed to renew Tor IP")
196
155
  logger.error(ex)
197
156
 
198
- def wait_for_new_ip(self, timeout=600, interval=5, check_timeout=5):
157
+ def wait_for_new_ip(self, timeout=PROXY_ROTATION_TIMEOUT_SECONDS, interval=30, check_timeout=5):
199
158
  """
200
- Wait for the IP address of the currently selected proxy to change.
159
+ Ensures that the IP address of the selected proxy differs from any other proxy chosen within the proxy rotation interval.
201
160
 
202
161
  :param timeout: Max seconds to wait for IP change
203
162
  :param interval: Seconds between IP checks
@@ -224,24 +183,36 @@ class ProxyInterface:
224
183
  else:
225
184
  raise RuntimeError("Could not select a proxy for IP waiting")
226
185
 
227
- # Auto-detect baseline IP
228
- host, port, user, pwd = entry
229
- proxies_map = {
230
- "http": self.get_proxy_url(host, port, user, pwd, "http"),
231
- "https": self.get_proxy_url(host, port, user, pwd, "http"),
232
- }
233
- try:
234
- resp = requests.get(self.CHECK_IP_URL, proxies=proxies_map, timeout=check_timeout)
235
- baseline = resp.json().get("YourFuckingIPAddress")
236
- if not baseline:
237
- raise RuntimeError(f"Could not determine baseline IP for entry {host}:{port}")
238
- logger.debug(f"Auto-detected baseline IP: {baseline}")
239
- except Exception as ex:
240
- raise RuntimeError(f"Could not determine baseline IP for entry {host}:{port}: {ex}") from ex
186
+ # Check if we should skip waiting based on global cooldown and traversal cycle
187
+ now = time.time()
188
+ interval_seconds = PROXY_ROTATION_INTERVAL.total_seconds()
189
+ for last_wait in self._last_ip_wait.values():
190
+ last_ts, last_cycle = last_wait
191
+
192
+ time_recent = last_ts is not None and (now - last_ts) <= interval_seconds
193
+ no_full_rotation = self._traversal_cycle <= last_cycle
194
+
195
+ # Skip only if both conditions are true: recent wait AND no full traversal cycle since
196
+ if time_recent and no_full_rotation:
197
+ logger.debug(
198
+ "Skipping wait_for_new_ip: last wait %.1fs ago and no full traversal since (last_cycle=%s current=%s)",
199
+ now - last_ts,
200
+ last_cycle,
201
+ self._traversal_cycle,
202
+ )
203
+ return
204
+
205
+ # Mark we are now attempting to wait for this entry
206
+ self._last_ip_wait[entry] = (now, self._traversal_cycle)
241
207
 
208
+ # Try to use cached baseline IP from health check
209
+ health = self._health.get(entry, {})
210
+ baseline = health.get("last_ip")
211
+ if baseline is None:
212
+ raise RuntimeError(f"Could not determine baseline IP for entry {entry[0]}:{entry[1]}")
242
213
  return self._wait_for_new_ip(entry, baseline, timeout, interval, check_timeout)
243
214
 
244
- def mark_entry_status(self, entry, ok, error=None, last_ip=None):
215
+ def _mark_entry_status(self, entry, ok, error=None, last_ip=None):
245
216
  """Update health cache for an entry."""
246
217
  self._health[entry] = {
247
218
  "ok": ok,
@@ -250,7 +221,7 @@ class ProxyInterface:
250
221
  "last_ip": last_ip,
251
222
  }
252
223
 
253
- def is_entry_alive(self, entry, timeout=5):
224
+ def _is_entry_alive(self, entry, timeout=5):
254
225
  """Check if a proxy entry is working by making a test request."""
255
226
  host, port, user, pwd = entry
256
227
  try:
@@ -261,10 +232,10 @@ class ProxyInterface:
261
232
  resp = requests.get(self.CHECK_IP_URL, proxies=proxies, timeout=timeout)
262
233
  ok = resp.status_code == 200
263
234
  last_ip = resp.json().get("YourFuckingIPAddress") if ok else None
264
- self.mark_entry_status(entry, ok, last_ip=last_ip)
235
+ self._mark_entry_status(entry, ok, last_ip=last_ip)
265
236
  return ok
266
237
  except Exception as ex:
267
- self.mark_entry_status(entry, False, str(ex))
238
+ self._mark_entry_status(entry, False, str(ex))
268
239
  return False
269
240
 
270
241
  def _get_working_entry(
@@ -272,49 +243,30 @@ class ProxyInterface:
272
243
  use_auth=False,
273
244
  randomize=False,
274
245
  check_timeout=5,
275
- cooldown_seconds=60,
276
- ensure_new_ip=False,
277
- ensure_new_ip_timeout=600,
278
- ensure_new_ip_interval=5,
279
- max_retry_seconds=600,
246
+ cooldown_seconds=30,
247
+ proxy_rotation_interval=PROXY_ROTATION_INTERVAL,
280
248
  ):
281
249
  """Get a working proxy entry, performing health checks as needed."""
282
250
  pool = self._build_pool(use_auth)
283
- candidates = self._get_candidates(pool, randomize)
284
-
285
- # Capture baseline before any health checks that might set it
286
- baseline_before = None
287
- if len(candidates) == 1:
288
- idx = candidates[0]
289
- entry = self.entries[idx]
290
- baseline_before = self._health.get(entry, {}).get("last_ip")
251
+ self._refresh_traversal_queue(pool, randomize)
291
252
 
292
253
  def _find_working_entry():
293
254
  if not self.entries:
294
255
  raise NoWorkingProxiesError("No proxies available")
295
- return self._find_working_entry_once(candidates, check_timeout, cooldown_seconds)
256
+ return self._find_working_entry_once(check_timeout, cooldown_seconds)
296
257
 
297
- if max_retry_seconds > 0:
258
+ # Handle both timedelta and numeric seconds for backward compatibility
259
+ if proxy_rotation_interval:
298
260
  retrying = tenacity.Retrying(
299
- stop=tenacity.stop_after_delay(max_retry_seconds),
261
+ wait=tenacity.wait_fixed(cooldown_seconds),
262
+ stop=tenacity.stop_after_delay(proxy_rotation_interval),
263
+ before_sleep=tenacity.before_sleep_log(logger, logging.INFO),
300
264
  reraise=True,
301
265
  )
302
266
  entry = retrying(_find_working_entry)
303
267
  else:
304
268
  entry = _find_working_entry()
305
269
 
306
- if ensure_new_ip and len(pool) == 1:
307
- baseline = self._health.get(entry, {}).get("last_ip")
308
- if not baseline_before:
309
- # First time seeing this proxy: it was already checked in _find_working_entry, so return immediately
310
- logger.debug("No baseline IP found; returning first-working proxy without waiting for IP change")
311
- else:
312
- # There is a baseline: wait for the IP to change
313
- logger.info(f"ensure_new_ip=True and single proxy, waiting for IP change: {entry[0]}:{entry[1]}")
314
- entry = self._wait_for_new_ip(
315
- entry, baseline, ensure_new_ip_timeout, ensure_new_ip_interval, check_timeout
316
- )
317
-
318
270
  return entry
319
271
 
320
272
  def _get_round_robin_candidates(self, pool):
@@ -334,15 +286,29 @@ class ProxyInterface:
334
286
  pool = self.entries
335
287
  return pool
336
288
 
337
- def _get_candidates(self, pool, randomize):
338
- if randomize:
339
- candidates = [idx for idx, entry in enumerate(self.entries) if entry in pool]
340
- return random.sample(candidates, k=len(candidates))
341
- else:
342
- return self._get_round_robin_candidates(pool)
289
+ def _refresh_traversal_queue(self, pool, randomize):
290
+ # Build current pool indices
291
+ current_pool_indices = [idx for idx, entry in enumerate(self.entries) if entry in pool]
343
292
 
344
- def _find_working_entry_once(self, candidates, check_timeout, cooldown_seconds):
345
- for idx in candidates:
293
+ # Check if we need to refill the traversal queue
294
+ if not self._traversal_queue and current_pool_indices:
295
+ if randomize:
296
+ self._traversal_queue = current_pool_indices.copy()
297
+ random.shuffle(self._traversal_queue)
298
+ else:
299
+ # Round-robin: start from next after current_index
300
+ self._traversal_queue = []
301
+ start_idx = (self.current_index + 1) % len(self.entries) if self.current_index >= 0 else 0
302
+ for i in range(len(self.entries)):
303
+ idx = (start_idx + i) % len(self.entries)
304
+ if idx in current_pool_indices:
305
+ self._traversal_queue.append(idx)
306
+ self._traversal_start = time.time()
307
+ self._traversal_cycle += 1
308
+
309
+ def _find_working_entry_once(self, check_timeout, cooldown_seconds):
310
+ # Consume from traversal queue for cached checks
311
+ for idx in self._traversal_queue:
346
312
  entry = self.entries[idx]
347
313
  health = self._health.get(entry, {})
348
314
  last_checked = health.get("last_checked", 0)
@@ -352,26 +318,21 @@ class ProxyInterface:
352
318
  if ok and (now - last_checked) < cooldown_seconds:
353
319
  logger.debug(f"Using cached working proxy: {entry[0]}:{entry[1]}")
354
320
  self.current_index = idx
321
+ self._traversal_queue.remove(idx)
355
322
  return entry
356
323
  elif not ok and (now - last_checked) < cooldown_seconds:
357
324
  continue
358
325
  else:
359
326
  logger.debug(f"Checking proxy health: {entry[0]}:{entry[1]}")
360
- if self.is_entry_alive(entry, timeout=check_timeout):
327
+ if self._is_entry_alive(entry, timeout=check_timeout):
361
328
  self.current_index = idx
329
+ self._traversal_queue.remove(idx)
362
330
  return entry
363
331
 
364
- logger.warning("No cached working proxies, forcing fresh checks")
365
- for idx in candidates:
366
- entry = self.entries[idx]
367
- logger.debug(f"Force-checking proxy: {entry[0]}:{entry[1]}")
368
- if self.is_entry_alive(entry, timeout=check_timeout):
369
- self.current_index = idx
370
- return entry
371
-
372
332
  raise NoWorkingProxiesError("No working proxies available")
373
333
 
374
334
  def _wait_for_new_ip(self, entry, baseline, timeout, interval, check_timeout):
335
+ logger.info("Refreshing proxy IP...")
375
336
  start = time.time()
376
337
  while time.time() - start < timeout:
377
338
  host, port, user, pwd = entry
@@ -386,9 +347,9 @@ class ProxyInterface:
386
347
  current_ip = None
387
348
 
388
349
  if current_ip and current_ip != baseline:
389
- self.mark_entry_status(entry, True, last_ip=current_ip)
350
+ self._mark_entry_status(entry, True, last_ip=current_ip)
390
351
  logger.info(f"IP changed from {baseline} to {current_ip}")
391
- return entry
352
+ return
392
353
 
393
354
  time.sleep(interval)
394
355
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: datamarket
3
- Version: 0.7.106
3
+ Version: 0.7.108
4
4
  Summary: Utilities that integrate advanced scraping knowledge into just one library.
5
5
  License: GPL-3.0-or-later
6
6
  Author: DataMarket
@@ -9,7 +9,7 @@ datamarket/interfaces/drive.py,sha256=3nhx3THr2SHNWKYwme9F2nPpvsqyEMFIxz0whF2FjH
9
9
  datamarket/interfaces/ftp.py,sha256=LH3Oz19k_xUNhzDXcrq5Ofb4c3uiph5pWUqpgiaDvHI,2671
10
10
  datamarket/interfaces/nominatim.py,sha256=xizT94tVum7QPppfDgI5sEhx1mAXT-SM3JyPl8CDxxU,15148
11
11
  datamarket/interfaces/peerdb.py,sha256=sO451wEGNb_0DDwchZ6eBVYKltqHM5XKau-WsfspXzA,23640
12
- datamarket/interfaces/proxy.py,sha256=e-bbmtjyjkh4ZAuk6o0nm-UcynH4qmDZnzDQzbOasb8,16063
12
+ datamarket/interfaces/proxy.py,sha256=T5imj-f2-ZIy7TM8UhkkUWAAdlEbJG95KHohoYgpZEo,14903
13
13
  datamarket/interfaces/tinybird.py,sha256=cNG-kAPTdQn2inlNX9LPf-VVdtnLud947ApLVO40Now,2594
14
14
  datamarket/params/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
15
  datamarket/params/nominatim.py,sha256=S9TEB4FxmffvFyK9KffWl20TfXzWX69IAdbEehKar1I,11920
@@ -29,7 +29,7 @@ datamarket/utils/strings/obfuscation.py,sha256=Jo-x3f2Cb75983smmpcdPqUlBrLCTyrnm
29
29
  datamarket/utils/strings/standardization.py,sha256=j_NbT-O1XnxDvDhct8panfkrfAC8R5OX6XM5fYBZ4RU,1496
30
30
  datamarket/utils/typer.py,sha256=geWuwMwGQjBQhxo27hX0vEAeRl1j1TS0u2oFVfpAs5I,816
31
31
  datamarket/utils/types.py,sha256=vxdQZdwdXrfPR4Es52gBgol-tMRIOD6oK9cBo3rB0JQ,74
32
- datamarket-0.7.106.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
33
- datamarket-0.7.106.dist-info/METADATA,sha256=A76E203rT92P2sSWe_FpjyL4I3vqirVPWHOFAXYlZug,7382
34
- datamarket-0.7.106.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
35
- datamarket-0.7.106.dist-info/RECORD,,
32
+ datamarket-0.7.108.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
33
+ datamarket-0.7.108.dist-info/METADATA,sha256=nFR_qzIVAlSc3L_0P2D1fWN7GdQTTINVZ_AkCDmymgc,7382
34
+ datamarket-0.7.108.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
35
+ datamarket-0.7.108.dist-info/RECORD,,