datamarket 0.7.105__tar.gz → 0.7.107__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This is a potentially problematic release.


This version of datamarket might be problematic. Click here for more details.

Files changed (35)
  1. {datamarket-0.7.105 → datamarket-0.7.107}/PKG-INFO +1 -1
  2. {datamarket-0.7.105 → datamarket-0.7.107}/pyproject.toml +1 -1
  3. {datamarket-0.7.105 → datamarket-0.7.107}/src/datamarket/interfaces/proxy.py +122 -121
  4. {datamarket-0.7.105 → datamarket-0.7.107}/LICENSE +0 -0
  5. {datamarket-0.7.105 → datamarket-0.7.107}/README.md +0 -0
  6. {datamarket-0.7.105 → datamarket-0.7.107}/src/datamarket/__init__.py +0 -0
  7. {datamarket-0.7.105 → datamarket-0.7.107}/src/datamarket/exceptions/__init__.py +0 -0
  8. {datamarket-0.7.105 → datamarket-0.7.107}/src/datamarket/exceptions/main.py +0 -0
  9. {datamarket-0.7.105 → datamarket-0.7.107}/src/datamarket/interfaces/__init__.py +0 -0
  10. {datamarket-0.7.105 → datamarket-0.7.107}/src/datamarket/interfaces/alchemy.py +0 -0
  11. {datamarket-0.7.105 → datamarket-0.7.107}/src/datamarket/interfaces/aws.py +0 -0
  12. {datamarket-0.7.105 → datamarket-0.7.107}/src/datamarket/interfaces/azure.py +0 -0
  13. {datamarket-0.7.105 → datamarket-0.7.107}/src/datamarket/interfaces/drive.py +0 -0
  14. {datamarket-0.7.105 → datamarket-0.7.107}/src/datamarket/interfaces/ftp.py +0 -0
  15. {datamarket-0.7.105 → datamarket-0.7.107}/src/datamarket/interfaces/nominatim.py +0 -0
  16. {datamarket-0.7.105 → datamarket-0.7.107}/src/datamarket/interfaces/peerdb.py +0 -0
  17. {datamarket-0.7.105 → datamarket-0.7.107}/src/datamarket/interfaces/tinybird.py +0 -0
  18. {datamarket-0.7.105 → datamarket-0.7.107}/src/datamarket/params/__init__.py +0 -0
  19. {datamarket-0.7.105 → datamarket-0.7.107}/src/datamarket/params/nominatim.py +0 -0
  20. {datamarket-0.7.105 → datamarket-0.7.107}/src/datamarket/utils/__init__.py +0 -0
  21. {datamarket-0.7.105 → datamarket-0.7.107}/src/datamarket/utils/airflow.py +0 -0
  22. {datamarket-0.7.105 → datamarket-0.7.107}/src/datamarket/utils/alchemy.py +0 -0
  23. {datamarket-0.7.105 → datamarket-0.7.107}/src/datamarket/utils/main.py +0 -0
  24. {datamarket-0.7.105 → datamarket-0.7.107}/src/datamarket/utils/nominatim.py +0 -0
  25. {datamarket-0.7.105 → datamarket-0.7.107}/src/datamarket/utils/playwright/__init__.py +0 -0
  26. {datamarket-0.7.105 → datamarket-0.7.107}/src/datamarket/utils/playwright/async_api.py +0 -0
  27. {datamarket-0.7.105 → datamarket-0.7.107}/src/datamarket/utils/playwright/sync_api.py +0 -0
  28. {datamarket-0.7.105 → datamarket-0.7.107}/src/datamarket/utils/selenium.py +0 -0
  29. {datamarket-0.7.105 → datamarket-0.7.107}/src/datamarket/utils/soda.py +0 -0
  30. {datamarket-0.7.105 → datamarket-0.7.107}/src/datamarket/utils/strings/__init__.py +0 -0
  31. {datamarket-0.7.105 → datamarket-0.7.107}/src/datamarket/utils/strings/normalization.py +0 -0
  32. {datamarket-0.7.105 → datamarket-0.7.107}/src/datamarket/utils/strings/obfuscation.py +0 -0
  33. {datamarket-0.7.105 → datamarket-0.7.107}/src/datamarket/utils/strings/standardization.py +0 -0
  34. {datamarket-0.7.105 → datamarket-0.7.107}/src/datamarket/utils/typer.py +0 -0
  35. {datamarket-0.7.105 → datamarket-0.7.107}/src/datamarket/utils/types.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: datamarket
3
- Version: 0.7.105
3
+ Version: 0.7.107
4
4
  Summary: Utilities that integrate advanced scraping knowledge into just one library.
5
5
  License: GPL-3.0-or-later
6
6
  Author: DataMarket
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "datamarket"
3
- version = "0.7.105"
3
+ version = "0.7.107"
4
4
  description = "Utilities that integrate advanced scraping knowledge into just one library."
5
5
  authors = ["DataMarket <techsupport@datamarket.es>"]
6
6
  license = "GPL-3.0-or-later"
@@ -1,6 +1,7 @@
1
1
  import logging
2
2
  import random
3
3
  import time
4
+ from datetime import timedelta
4
5
 
5
6
  import requests
6
7
  import tenacity
@@ -12,6 +13,9 @@ from datamarket.exceptions import EnsureNewIPTimeoutError, NoWorkingProxiesError
12
13
  logger = logging.getLogger(__name__)
13
14
  logging.getLogger("stem").setLevel(logging.WARNING)
14
15
 
16
+ PROXY_ROTATION_INTERVAL = timedelta(minutes=10)
17
+ PROXY_ROTATION_TIMEOUT_SECONDS = int(PROXY_ROTATION_INTERVAL.total_seconds())
18
+
15
19
 
16
20
  class ProxyInterface:
17
21
  """
@@ -22,8 +26,12 @@ class ProxyInterface:
22
26
 
23
27
  def __init__(self, config):
24
28
  self._load_from_config(config)
25
- self.current_index = random.randrange(len(self.entries)) if self.entries else 0 # noqa: S311
29
+ self.current_index = -2 # -2 means no selection made yet, -1 means Tor selected
26
30
  self._health = {} # {entry: {"ok": bool, "last_checked": time.time(), "last_error": str}}
31
+ self._traversal_queue = [] # Queue of indices left to test in current traversal
32
+ self._traversal_start = None # Timestamp when current traversal started
33
+ self._last_ip_wait = {} # {entry: (timestamp, traversal_cycle)} - last time we attempted to wait for IP change
34
+ self._traversal_cycle = 0 # Counter of full traversals of the queue
27
35
 
28
36
  def _load_from_config(self, cfg):
29
37
  # Tor password (optional)
@@ -67,13 +75,9 @@ class ProxyInterface:
67
75
  raw=False,
68
76
  use_auth=False,
69
77
  use_socks=False,
70
- health_check=True,
71
78
  check_timeout=5,
72
- cooldown_seconds=600,
73
- ensure_new_ip=False,
74
- ensure_new_ip_timeout=600,
75
- ensure_new_ip_interval=5,
76
- max_retry_seconds=600,
79
+ cooldown_seconds=30,
80
+ proxy_rotation_interval=PROXY_ROTATION_INTERVAL,
77
81
  ):
78
82
  """
79
83
  Return parsed proxy URLs or raw entry tuple for a working proxy.
@@ -82,35 +86,25 @@ class ProxyInterface:
82
86
  :param randomize: select a random proxy if True, otherwise round-robin
83
87
  :param raw: return raw (host, port, user, password) tuple if True
84
88
  :param use_auth: include proxies that require authentication if True; otherwise only credential-free
85
- :param health_check: perform health checks to ensure proxy is working if True
86
89
  :param check_timeout: timeout in seconds for health check requests
87
90
  :param cooldown_seconds: how long to cache health status before re-checking
88
- :param ensure_new_ip: if True and only one proxy available, wait until IP changes before returning
89
- :param ensure_new_ip_timeout: max seconds to wait for IP change when ensure_new_ip=True
90
- :param ensure_new_ip_interval: seconds between IP checks when ensure_new_ip=True
91
- :param max_retry_seconds: max seconds to retry finding working proxies (0 to disable)
91
+ :param proxy_rotation_interval: max time to retry finding working proxies (timedelta or seconds, 0 to disable)
92
92
  """
93
93
  # Tor handling (skip health check for tor)
94
94
  if use_tor:
95
+ self.current_index = -1 # Indicate Tor is selected
95
96
  if raw:
96
97
  return ("127.0.0.1", "9050", None, None)
97
98
  return {"socks5": self.get_proxy_url("127.0.0.1", 9050, schema="socks5")}
98
99
 
99
100
  # Get a working entry (with health checks if enabled)
100
- if health_check:
101
- host, port, user, password = self._get_working_entry(
102
- use_auth=use_auth,
103
- randomize=randomize,
104
- check_timeout=check_timeout,
105
- cooldown_seconds=cooldown_seconds,
106
- ensure_new_ip=ensure_new_ip,
107
- ensure_new_ip_timeout=ensure_new_ip_timeout,
108
- ensure_new_ip_interval=ensure_new_ip_interval,
109
- max_retry_seconds=max_retry_seconds,
110
- )
111
- else:
112
- # Legacy behavior: no health check
113
- host, port, user, password = self.get_random(use_auth) if randomize else self.get_next(use_auth)
101
+ host, port, user, password = self._get_working_entry(
102
+ use_auth=use_auth,
103
+ randomize=randomize,
104
+ check_timeout=check_timeout,
105
+ cooldown_seconds=cooldown_seconds,
106
+ proxy_rotation_interval=proxy_rotation_interval,
107
+ )
114
108
 
115
109
  if raw:
116
110
  return host, port, user, password
@@ -126,45 +120,6 @@ class ProxyInterface:
126
120
  "https": self.get_proxy_url(host, port, user, password, "http"),
127
121
  }
128
122
 
129
- def get_next(self, use_auth=False):
130
- # Round-robin selection, optionally filtering out authenticated proxies
131
- if not self.entries:
132
- raise NoWorkingProxiesError("No proxies available")
133
-
134
- pool = self.entries if use_auth else [e for e in self.entries if not e[2] and not e[3]]
135
- if not pool:
136
- pool = self.entries
137
-
138
- # Find next in pool using current_index
139
- for _ in range(len(self.entries)):
140
- idx = self.current_index
141
- self.current_index = (self.current_index + 1) % len(self.entries)
142
- entry = self.entries[idx]
143
- if entry in pool:
144
- return entry
145
-
146
- # Fallback to first entry
147
- return self.entries[0]
148
-
149
- def get_random(self, use_auth=False):
150
- # Random selection, optionally filtering out authenticated proxies
151
- if not self.entries:
152
- raise NoWorkingProxiesError("No proxies available")
153
-
154
- pool = self.entries if use_auth else [e for e in self.entries if not e[2] and not e[3]]
155
- if not pool:
156
- pool = self.entries
157
-
158
- entry = random.choice(pool) # noqa: S311
159
- # Update index to after selected entry for round-robin continuity
160
- try:
161
- pos = self.entries.index(entry)
162
- self.current_index = (pos + 1) % len(self.entries)
163
- except ValueError:
164
- pass
165
-
166
- return entry
167
-
168
123
  def check_current_ip(self, proxies=None):
169
124
  try:
170
125
  proxies_arg = proxies or {"http": self.proxies["http"]}
@@ -190,7 +145,65 @@ class ProxyInterface:
190
145
  logger.error("Failed to renew Tor IP")
191
146
  logger.error(ex)
192
147
 
193
- def mark_entry_status(self, entry, ok, error=None, last_ip=None):
148
+ def wait_for_new_ip(self, timeout=PROXY_ROTATION_TIMEOUT_SECONDS, interval=30, check_timeout=5):
149
+ """
150
+ Ensures that the IP address of the selected proxy differs from any other proxy chosen within the proxy rotation interval.
151
+
152
+ :param timeout: Max seconds to wait for IP change
153
+ :param interval: Seconds between IP checks
154
+ :param check_timeout: Timeout for individual IP check requests
155
+ :return: The selected entry (unchanged)
156
+ :raises RuntimeError: If no proxy is available or baseline cannot be determined
157
+ :raises EnsureNewIPTimeoutError: If IP doesn't change within timeout
158
+ """
159
+ # Use currently selected proxy
160
+ if self.current_index == -1:
161
+ # Tor is selected
162
+ entry = ("127.0.0.1", "9050", None, None)
163
+ elif self.current_index >= 0 and self.current_index < len(self.entries):
164
+ # current_index points to the selected entry
165
+ entry = self.entries[self.current_index]
166
+ else:
167
+ # No valid selection, select one
168
+ logger.debug("No proxy currently selected, selecting one for IP waiting")
169
+ self.get_proxies(raw=True)
170
+ if self.current_index == -1:
171
+ entry = ("127.0.0.1", "9050", None, None)
172
+ elif self.current_index >= 0 and self.current_index < len(self.entries):
173
+ entry = self.entries[self.current_index]
174
+ else:
175
+ raise RuntimeError("Could not select a proxy for IP waiting")
176
+
177
+ # Check if we should skip waiting based on global cooldown and traversal cycle
178
+ now = time.time()
179
+ interval_seconds = PROXY_ROTATION_INTERVAL.total_seconds()
180
+ for last_wait in self._last_ip_wait.values():
181
+ last_ts, last_cycle = last_wait
182
+
183
+ time_recent = last_ts is not None and (now - last_ts) <= interval_seconds
184
+ no_full_rotation = self._traversal_cycle <= last_cycle
185
+
186
+ # Skip only if both conditions are true: recent wait AND no full traversal cycle since
187
+ if time_recent and no_full_rotation:
188
+ logger.debug(
189
+ "Skipping wait_for_new_ip: last wait %.1fs ago and no full traversal since (last_cycle=%s current=%s)",
190
+ now - last_ts,
191
+ last_cycle,
192
+ self._traversal_cycle,
193
+ )
194
+ return
195
+
196
+ # Mark we are now attempting to wait for this entry
197
+ self._last_ip_wait[entry] = (now, self._traversal_cycle)
198
+
199
+ # Try to use cached baseline IP from health check
200
+ health = self._health.get(entry, {})
201
+ baseline = health.get("last_ip")
202
+ if baseline is None:
203
+ raise RuntimeError(f"Could not determine baseline IP for entry {entry[0]}:{entry[1]}")
204
+ return self._wait_for_new_ip(entry, baseline, timeout, interval, check_timeout)
205
+
206
+ def _mark_entry_status(self, entry, ok, error=None, last_ip=None):
194
207
  """Update health cache for an entry."""
195
208
  self._health[entry] = {
196
209
  "ok": ok,
@@ -199,7 +212,7 @@ class ProxyInterface:
199
212
  "last_ip": last_ip,
200
213
  }
201
214
 
202
- def is_entry_alive(self, entry, timeout=5):
215
+ def _is_entry_alive(self, entry, timeout=5):
203
216
  """Check if a proxy entry is working by making a test request."""
204
217
  host, port, user, pwd = entry
205
218
  try:
@@ -210,10 +223,10 @@ class ProxyInterface:
210
223
  resp = requests.get(self.CHECK_IP_URL, proxies=proxies, timeout=timeout)
211
224
  ok = resp.status_code == 200
212
225
  last_ip = resp.json().get("YourFuckingIPAddress") if ok else None
213
- self.mark_entry_status(entry, ok, last_ip=last_ip)
226
+ self._mark_entry_status(entry, ok, last_ip=last_ip)
214
227
  return ok
215
228
  except Exception as ex:
216
- self.mark_entry_status(entry, False, str(ex))
229
+ self._mark_entry_status(entry, False, str(ex))
217
230
  return False
218
231
 
219
232
  def _get_working_entry(
@@ -221,65 +234,39 @@ class ProxyInterface:
221
234
  use_auth=False,
222
235
  randomize=False,
223
236
  check_timeout=5,
224
- cooldown_seconds=60,
225
- ensure_new_ip=False,
226
- ensure_new_ip_timeout=600,
227
- ensure_new_ip_interval=5,
228
- max_retry_seconds=600,
237
+ cooldown_seconds=30,
238
+ proxy_rotation_interval=PROXY_ROTATION_INTERVAL,
229
239
  ):
230
240
  """Get a working proxy entry, performing health checks as needed."""
231
241
  pool = self._build_pool(use_auth)
232
- candidates = self._get_candidates(pool, randomize)
233
-
234
- # Capture baseline before any health checks that might set it
235
- baseline_before = None
236
- if len(candidates) == 1:
237
- baseline_before = self._health.get(candidates[0], {}).get("last_ip")
242
+ self._refresh_traversal_queue(pool, randomize)
238
243
 
239
244
  def _find_working_entry():
240
245
  if not self.entries:
241
246
  raise NoWorkingProxiesError("No proxies available")
242
- return self._find_working_entry_once(candidates, check_timeout, cooldown_seconds)
247
+ return self._find_working_entry_once(check_timeout, cooldown_seconds)
243
248
 
244
- if max_retry_seconds > 0:
249
+ # Handle both timedelta and numeric seconds for backward compatibility
250
+ if proxy_rotation_interval:
245
251
  retrying = tenacity.Retrying(
246
- stop=tenacity.stop_after_delay(max_retry_seconds),
252
+ stop=tenacity.stop_after_delay(proxy_rotation_interval),
247
253
  reraise=True,
248
254
  )
249
255
  entry = retrying(_find_working_entry)
250
256
  else:
251
257
  entry = _find_working_entry()
252
258
 
253
- if ensure_new_ip and len(pool) == 1:
254
- logger.debug(f"ensure_new_ip=True and single proxy, handling IP change check: {entry[0]}:{entry[1]}")
255
- baseline = self._health.get(entry, {}).get("last_ip")
256
- if not baseline_before:
257
- # First time seeing this proxy: it was already checked in _find_working_entry, so return immediately
258
- logger.debug("No baseline IP found; returning first-working proxy without waiting for IP change")
259
- else:
260
- # There is a baseline: wait for the IP to change
261
- entry = self._wait_for_new_ip(
262
- entry, baseline, ensure_new_ip_timeout, ensure_new_ip_interval, check_timeout
263
- )
264
-
265
259
  return entry
266
260
 
267
261
  def _get_round_robin_candidates(self, pool):
268
262
  """Get candidates in round-robin order starting from current_index."""
269
263
  candidates = []
270
- start_idx = self.current_index
264
+ start_idx = (self.current_index + 1) % len(self.entries) if self.current_index >= 0 else 0
271
265
  for i in range(len(self.entries)):
272
266
  idx = (start_idx + i) % len(self.entries)
273
267
  entry = self.entries[idx]
274
268
  if entry in pool:
275
- candidates.append(entry)
276
- # Update current_index for next call
277
- if candidates:
278
- try:
279
- pos = self.entries.index(candidates[0])
280
- self.current_index = (pos + 1) % len(self.entries)
281
- except ValueError:
282
- pass
269
+ candidates.append(idx)
283
270
  return candidates
284
271
 
285
272
  def _build_pool(self, use_auth):
@@ -288,14 +275,30 @@ class ProxyInterface:
288
275
  pool = self.entries
289
276
  return pool
290
277
 
291
- def _get_candidates(self, pool, randomize):
292
- if randomize:
293
- return pool[:]
294
- else:
295
- return self._get_round_robin_candidates(pool)
278
+ def _refresh_traversal_queue(self, pool, randomize):
279
+ # Build current pool indices
280
+ current_pool_indices = [idx for idx, entry in enumerate(self.entries) if entry in pool]
296
281
 
297
- def _find_working_entry_once(self, candidates, check_timeout, cooldown_seconds):
298
- for entry in candidates:
282
+ # Check if we need to refill the traversal queue
283
+ if not self._traversal_queue and current_pool_indices:
284
+ if randomize:
285
+ self._traversal_queue = current_pool_indices.copy()
286
+ random.shuffle(self._traversal_queue)
287
+ else:
288
+ # Round-robin: start from next after current_index
289
+ self._traversal_queue = []
290
+ start_idx = (self.current_index + 1) % len(self.entries) if self.current_index >= 0 else 0
291
+ for i in range(len(self.entries)):
292
+ idx = (start_idx + i) % len(self.entries)
293
+ if idx in current_pool_indices:
294
+ self._traversal_queue.append(idx)
295
+ self._traversal_start = time.time()
296
+ self._traversal_cycle += 1
297
+
298
+ def _find_working_entry_once(self, check_timeout, cooldown_seconds):
299
+ # Consume from traversal queue for cached checks
300
+ for idx in self._traversal_queue:
301
+ entry = self.entries[idx]
299
302
  health = self._health.get(entry, {})
300
303
  last_checked = health.get("last_checked", 0)
301
304
  ok = health.get("ok", False)
@@ -303,20 +306,18 @@ class ProxyInterface:
303
306
 
304
307
  if ok and (now - last_checked) < cooldown_seconds:
305
308
  logger.debug(f"Using cached working proxy: {entry[0]}:{entry[1]}")
309
+ self.current_index = idx
310
+ self._traversal_queue.remove(idx)
306
311
  return entry
307
312
  elif not ok and (now - last_checked) < cooldown_seconds:
308
313
  continue
309
314
  else:
310
315
  logger.debug(f"Checking proxy health: {entry[0]}:{entry[1]}")
311
- if self.is_entry_alive(entry, timeout=check_timeout):
316
+ if self._is_entry_alive(entry, timeout=check_timeout):
317
+ self.current_index = idx
318
+ self._traversal_queue.remove(idx)
312
319
  return entry
313
320
 
314
- logger.warning("No cached working proxies, forcing fresh checks")
315
- for entry in candidates:
316
- logger.debug(f"Force-checking proxy: {entry[0]}:{entry[1]}")
317
- if self.is_entry_alive(entry, timeout=check_timeout):
318
- return entry
319
-
320
321
  raise NoWorkingProxiesError("No working proxies available")
321
322
 
322
323
  def _wait_for_new_ip(self, entry, baseline, timeout, interval, check_timeout):
@@ -334,9 +335,9 @@ class ProxyInterface:
334
335
  current_ip = None
335
336
 
336
337
  if current_ip and current_ip != baseline:
337
- self.mark_entry_status(entry, True, last_ip=current_ip)
338
- logger.debug(f"IP changed from {baseline} to {current_ip}")
339
- return entry
338
+ self._mark_entry_status(entry, True, last_ip=current_ip)
339
+ logger.info(f"IP changed from {baseline} to {current_ip}")
340
+ return
340
341
 
341
342
  time.sleep(interval)
342
343
 
File without changes
File without changes