datamarket 0.7.106.tar.gz → 0.7.107.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datamarket might be problematic.

Files changed (35)
  1. {datamarket-0.7.106 → datamarket-0.7.107}/PKG-INFO +1 -1
  2. {datamarket-0.7.106 → datamarket-0.7.107}/pyproject.toml +1 -1
  3. {datamarket-0.7.106 → datamarket-0.7.107}/src/datamarket/interfaces/proxy.py +84 -135
  4. {datamarket-0.7.106 → datamarket-0.7.107}/LICENSE +0 -0
  5. {datamarket-0.7.106 → datamarket-0.7.107}/README.md +0 -0
  6. {datamarket-0.7.106 → datamarket-0.7.107}/src/datamarket/__init__.py +0 -0
  7. {datamarket-0.7.106 → datamarket-0.7.107}/src/datamarket/exceptions/__init__.py +0 -0
  8. {datamarket-0.7.106 → datamarket-0.7.107}/src/datamarket/exceptions/main.py +0 -0
  9. {datamarket-0.7.106 → datamarket-0.7.107}/src/datamarket/interfaces/__init__.py +0 -0
  10. {datamarket-0.7.106 → datamarket-0.7.107}/src/datamarket/interfaces/alchemy.py +0 -0
  11. {datamarket-0.7.106 → datamarket-0.7.107}/src/datamarket/interfaces/aws.py +0 -0
  12. {datamarket-0.7.106 → datamarket-0.7.107}/src/datamarket/interfaces/azure.py +0 -0
  13. {datamarket-0.7.106 → datamarket-0.7.107}/src/datamarket/interfaces/drive.py +0 -0
  14. {datamarket-0.7.106 → datamarket-0.7.107}/src/datamarket/interfaces/ftp.py +0 -0
  15. {datamarket-0.7.106 → datamarket-0.7.107}/src/datamarket/interfaces/nominatim.py +0 -0
  16. {datamarket-0.7.106 → datamarket-0.7.107}/src/datamarket/interfaces/peerdb.py +0 -0
  17. {datamarket-0.7.106 → datamarket-0.7.107}/src/datamarket/interfaces/tinybird.py +0 -0
  18. {datamarket-0.7.106 → datamarket-0.7.107}/src/datamarket/params/__init__.py +0 -0
  19. {datamarket-0.7.106 → datamarket-0.7.107}/src/datamarket/params/nominatim.py +0 -0
  20. {datamarket-0.7.106 → datamarket-0.7.107}/src/datamarket/utils/__init__.py +0 -0
  21. {datamarket-0.7.106 → datamarket-0.7.107}/src/datamarket/utils/airflow.py +0 -0
  22. {datamarket-0.7.106 → datamarket-0.7.107}/src/datamarket/utils/alchemy.py +0 -0
  23. {datamarket-0.7.106 → datamarket-0.7.107}/src/datamarket/utils/main.py +0 -0
  24. {datamarket-0.7.106 → datamarket-0.7.107}/src/datamarket/utils/nominatim.py +0 -0
  25. {datamarket-0.7.106 → datamarket-0.7.107}/src/datamarket/utils/playwright/__init__.py +0 -0
  26. {datamarket-0.7.106 → datamarket-0.7.107}/src/datamarket/utils/playwright/async_api.py +0 -0
  27. {datamarket-0.7.106 → datamarket-0.7.107}/src/datamarket/utils/playwright/sync_api.py +0 -0
  28. {datamarket-0.7.106 → datamarket-0.7.107}/src/datamarket/utils/selenium.py +0 -0
  29. {datamarket-0.7.106 → datamarket-0.7.107}/src/datamarket/utils/soda.py +0 -0
  30. {datamarket-0.7.106 → datamarket-0.7.107}/src/datamarket/utils/strings/__init__.py +0 -0
  31. {datamarket-0.7.106 → datamarket-0.7.107}/src/datamarket/utils/strings/normalization.py +0 -0
  32. {datamarket-0.7.106 → datamarket-0.7.107}/src/datamarket/utils/strings/obfuscation.py +0 -0
  33. {datamarket-0.7.106 → datamarket-0.7.107}/src/datamarket/utils/strings/standardization.py +0 -0
  34. {datamarket-0.7.106 → datamarket-0.7.107}/src/datamarket/utils/typer.py +0 -0
  35. {datamarket-0.7.106 → datamarket-0.7.107}/src/datamarket/utils/types.py +0 -0
PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: datamarket
-Version: 0.7.106
+Version: 0.7.107
 Summary: Utilities that integrate advanced scraping knowledge into just one library.
 License: GPL-3.0-or-later
 Author: DataMarket
pyproject.toml

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "datamarket"
-version = "0.7.106"
+version = "0.7.107"
 description = "Utilities that integrate advanced scraping knowledge into just one library."
 authors = ["DataMarket <techsupport@datamarket.es>"]
 license = "GPL-3.0-or-later"
src/datamarket/interfaces/proxy.py

@@ -1,6 +1,7 @@
 import logging
 import random
 import time
+from datetime import timedelta

 import requests
 import tenacity
@@ -12,6 +13,9 @@ from datamarket.exceptions import EnsureNewIPTimeoutError, NoWorkingProxiesError
 logger = logging.getLogger(__name__)
 logging.getLogger("stem").setLevel(logging.WARNING)

+PROXY_ROTATION_INTERVAL = timedelta(minutes=10)
+PROXY_ROTATION_TIMEOUT_SECONDS = int(PROXY_ROTATION_INTERVAL.total_seconds())
+

 class ProxyInterface:
     """
@@ -24,6 +28,10 @@ class ProxyInterface:
         self._load_from_config(config)
         self.current_index = -2  # -2 means no selection made yet, -1 means Tor selected
         self._health = {}  # {entry: {"ok": bool, "last_checked": time.time(), "last_error": str}}
+        self._traversal_queue = []  # Queue of indices left to test in current traversal
+        self._traversal_start = None  # Timestamp when current traversal started
+        self._last_ip_wait = {}  # {entry: (timestamp, traversal_cycle)} - last time we attempted to wait for IP change
+        self._traversal_cycle = 0  # Counter of full traversals of the queue

     def _load_from_config(self, cfg):
         # Tor password (optional)
@@ -67,13 +75,9 @@ class ProxyInterface:
         raw=False,
         use_auth=False,
         use_socks=False,
-        health_check=True,
         check_timeout=5,
-        cooldown_seconds=600,
-        ensure_new_ip=False,
-        ensure_new_ip_timeout=600,
-        ensure_new_ip_interval=5,
-        max_retry_seconds=600,
+        cooldown_seconds=30,
+        proxy_rotation_interval=PROXY_ROTATION_INTERVAL,
     ):
         """
         Return parsed proxy URLs or raw entry tuple for a working proxy.
@@ -82,13 +86,9 @@ class ProxyInterface:
         :param randomize: select a random proxy if True, otherwise round-robin
         :param raw: return raw (host, port, user, password) tuple if True
         :param use_auth: include proxies that require authentication if True; otherwise only credential-free
-        :param health_check: perform health checks to ensure proxy is working if True
         :param check_timeout: timeout in seconds for health check requests
         :param cooldown_seconds: how long to cache health status before re-checking
-        :param ensure_new_ip: if True and only one proxy available, wait until IP changes before returning
-        :param ensure_new_ip_timeout: max seconds to wait for IP change when ensure_new_ip=True
-        :param ensure_new_ip_interval: seconds between IP checks when ensure_new_ip=True
-        :param max_retry_seconds: max seconds to retry finding working proxies (0 to disable)
+        :param proxy_rotation_interval: max time to retry finding working proxies (timedelta or seconds, 0 to disable)
         """
         # Tor handling (skip health check for tor)
         if use_tor:
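
Together, these two hunks drop five keyword arguments from get_proxies and replace them with a single proxy_rotation_interval. Call sites that passed the removed flags need updating; a hedged before/after sketch (the proxy_interface instance is an assumption for illustration, not part of the diff):

    from datetime import timedelta

    # 0.7.106: health checking and retry limits were separate opt-in knobs
    proxies = proxy_interface.get_proxies(
        health_check=True, ensure_new_ip=True, max_retry_seconds=600
    )

    # 0.7.107: health checks always run; one parameter bounds the retry window
    proxies = proxy_interface.get_proxies(
        proxy_rotation_interval=timedelta(minutes=10)  # or 600, or 0 to disable
    )
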
@@ -98,20 +98,13 @@ class ProxyInterface:
             return {"socks5": self.get_proxy_url("127.0.0.1", 9050, schema="socks5")}

         # Get a working entry (with health checks if enabled)
-        if health_check:
-            host, port, user, password = self._get_working_entry(
-                use_auth=use_auth,
-                randomize=randomize,
-                check_timeout=check_timeout,
-                cooldown_seconds=cooldown_seconds,
-                ensure_new_ip=ensure_new_ip,
-                ensure_new_ip_timeout=ensure_new_ip_timeout,
-                ensure_new_ip_interval=ensure_new_ip_interval,
-                max_retry_seconds=max_retry_seconds,
-            )
-        else:
-            # Legacy behavior: no health check
-            host, port, user, password = self.get_random(use_auth) if randomize else self.get_next(use_auth)
+        host, port, user, password = self._get_working_entry(
+            use_auth=use_auth,
+            randomize=randomize,
+            check_timeout=check_timeout,
+            cooldown_seconds=cooldown_seconds,
+            proxy_rotation_interval=proxy_rotation_interval,
+        )

         if raw:
             return host, port, user, password
@@ -127,49 +120,6 @@ class ProxyInterface:
             "https": self.get_proxy_url(host, port, user, password, "http"),
         }

-    def get_next(self, use_auth=False):
-        # Round-robin selection, optionally filtering out authenticated proxies
-        if not self.entries:
-            raise NoWorkingProxiesError("No proxies available")
-
-        pool = self.entries if use_auth else [e for e in self.entries if not e[2] and not e[3]]
-        if not pool:
-            pool = self.entries
-
-        # Start from the next index after current_index
-        start_idx = (self.current_index + 1) % len(self.entries) if self.current_index >= 0 else 0
-
-        # Find next in pool starting from start_idx
-        for i in range(len(self.entries)):
-            idx = (start_idx + i) % len(self.entries)
-            entry = self.entries[idx]
-            if entry in pool:
-                self.current_index = idx  # Update to selected index
-                return entry
-
-        # Fallback to first entry
-        self.current_index = 0
-        return self.entries[0]
-
-    def get_random(self, use_auth=False):
-        # Random selection, optionally filtering out authenticated proxies
-        if not self.entries:
-            raise NoWorkingProxiesError("No proxies available")
-
-        pool = self.entries if use_auth else [e for e in self.entries if not e[2] and not e[3]]
-        if not pool:
-            pool = self.entries
-
-        entry = random.choice(pool)  # noqa: S311
-        # Update index to selected entry
-        try:
-            pos = self.entries.index(entry)
-            self.current_index = pos
-        except ValueError:
-            pass
-
-        return entry
-
     def check_current_ip(self, proxies=None):
         try:
             proxies_arg = proxies or {"http": self.proxies["http"]}
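
get_next and get_random were public methods, so this hunk is a breaking change for any code that called them directly. The equivalent selection now goes through the health-checked path; a sketch of the migration (proxy_interface is again an assumed instance):

    # 0.7.106: unchecked selection
    host, port, user, password = proxy_interface.get_next(use_auth=True)    # round-robin
    host, port, user, password = proxy_interface.get_random(use_auth=True)  # random

    # 0.7.107: raw=True returns the same (host, port, user, password) tuple,
    # but only after the entry passes a health check
    host, port, user, password = proxy_interface.get_proxies(raw=True, use_auth=True)
    host, port, user, password = proxy_interface.get_proxies(raw=True, use_auth=True, randomize=True)
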
@@ -195,9 +145,9 @@ class ProxyInterface:
             logger.error("Failed to renew Tor IP")
             logger.error(ex)

-    def wait_for_new_ip(self, timeout=600, interval=5, check_timeout=5):
+    def wait_for_new_ip(self, timeout=PROXY_ROTATION_TIMEOUT_SECONDS, interval=30, check_timeout=5):
         """
-        Wait for the IP address of the currently selected proxy to change.
+        Ensures that the IP address of the selected proxy differs from any other proxy chosen within the proxy rotation interval.

         :param timeout: Max seconds to wait for IP change
         :param interval: Seconds between IP checks
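
The defaults move from aggressive polling (every 5 s, up to 600 s) to a gentler cadence tied to the rotation window (every 30 s, up to PROXY_ROTATION_TIMEOUT_SECONDS). Typical usage under the new defaults (sketch, assumed instance):

    # Polls every 30 s for at most 600 s by default
    proxy_interface.wait_for_new_ip()

    # Tighter bounds are still available per call
    proxy_interface.wait_for_new_ip(timeout=120, interval=10)
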
@@ -224,24 +174,36 @@ class ProxyInterface:
         else:
             raise RuntimeError("Could not select a proxy for IP waiting")

-        # Auto-detect baseline IP
-        host, port, user, pwd = entry
-        proxies_map = {
-            "http": self.get_proxy_url(host, port, user, pwd, "http"),
-            "https": self.get_proxy_url(host, port, user, pwd, "http"),
-        }
-        try:
-            resp = requests.get(self.CHECK_IP_URL, proxies=proxies_map, timeout=check_timeout)
-            baseline = resp.json().get("YourFuckingIPAddress")
-            if not baseline:
-                raise RuntimeError(f"Could not determine baseline IP for entry {host}:{port}")
-            logger.debug(f"Auto-detected baseline IP: {baseline}")
-        except Exception as ex:
-            raise RuntimeError(f"Could not determine baseline IP for entry {host}:{port}: {ex}") from ex
+        # Check if we should skip waiting based on global cooldown and traversal cycle
+        now = time.time()
+        interval_seconds = PROXY_ROTATION_INTERVAL.total_seconds()
+        for last_wait in self._last_ip_wait.values():
+            last_ts, last_cycle = last_wait
+
+            time_recent = last_ts is not None and (now - last_ts) <= interval_seconds
+            no_full_rotation = self._traversal_cycle <= last_cycle
+
+            # Skip only if both conditions are true: recent wait AND no full traversal cycle since
+            if time_recent and no_full_rotation:
+                logger.debug(
+                    "Skipping wait_for_new_ip: last wait %.1fs ago and no full traversal since (last_cycle=%s current=%s)",
+                    now - last_ts,
+                    last_cycle,
+                    self._traversal_cycle,
+                )
+                return

+        # Mark we are now attempting to wait for this entry
+        self._last_ip_wait[entry] = (now, self._traversal_cycle)
+
+        # Try to use cached baseline IP from health check
+        health = self._health.get(entry, {})
+        baseline = health.get("last_ip")
+        if baseline is None:
+            raise RuntimeError(f"Could not determine baseline IP for entry {entry[0]}:{entry[1]}")
         return self._wait_for_new_ip(entry, baseline, timeout, interval, check_timeout)

-    def mark_entry_status(self, entry, ok, error=None, last_ip=None):
+    def _mark_entry_status(self, entry, ok, error=None, last_ip=None):
         """Update health cache for an entry."""
         self._health[entry] = {
             "ok": ok,
@@ -250,7 +212,7 @@ class ProxyInterface:
             "last_ip": last_ip,
         }

-    def is_entry_alive(self, entry, timeout=5):
+    def _is_entry_alive(self, entry, timeout=5):
         """Check if a proxy entry is working by making a test request."""
         host, port, user, pwd = entry
         try:
@@ -261,10 +223,10 @@ class ProxyInterface:
             resp = requests.get(self.CHECK_IP_URL, proxies=proxies, timeout=timeout)
             ok = resp.status_code == 200
             last_ip = resp.json().get("YourFuckingIPAddress") if ok else None
-            self.mark_entry_status(entry, ok, last_ip=last_ip)
+            self._mark_entry_status(entry, ok, last_ip=last_ip)
             return ok
         except Exception as ex:
-            self.mark_entry_status(entry, False, str(ex))
+            self._mark_entry_status(entry, False, str(ex))
             return False

     def _get_working_entry(
@@ -272,49 +234,28 @@ class ProxyInterface:
         use_auth=False,
         randomize=False,
         check_timeout=5,
-        cooldown_seconds=60,
-        ensure_new_ip=False,
-        ensure_new_ip_timeout=600,
-        ensure_new_ip_interval=5,
-        max_retry_seconds=600,
+        cooldown_seconds=30,
+        proxy_rotation_interval=PROXY_ROTATION_INTERVAL,
     ):
         """Get a working proxy entry, performing health checks as needed."""
         pool = self._build_pool(use_auth)
-        candidates = self._get_candidates(pool, randomize)
-
-        # Capture baseline before any health checks that might set it
-        baseline_before = None
-        if len(candidates) == 1:
-            idx = candidates[0]
-            entry = self.entries[idx]
-            baseline_before = self._health.get(entry, {}).get("last_ip")
+        self._refresh_traversal_queue(pool, randomize)

         def _find_working_entry():
             if not self.entries:
                 raise NoWorkingProxiesError("No proxies available")
-            return self._find_working_entry_once(candidates, check_timeout, cooldown_seconds)
+            return self._find_working_entry_once(check_timeout, cooldown_seconds)

-        if max_retry_seconds > 0:
+        # Handle both timedelta and numeric seconds for backward compatibility
+        if proxy_rotation_interval:
             retrying = tenacity.Retrying(
-                stop=tenacity.stop_after_delay(max_retry_seconds),
+                stop=tenacity.stop_after_delay(proxy_rotation_interval),
                 reraise=True,
             )
             entry = retrying(_find_working_entry)
         else:
             entry = _find_working_entry()

-        if ensure_new_ip and len(pool) == 1:
-            baseline = self._health.get(entry, {}).get("last_ip")
-            if not baseline_before:
-                # First time seeing this proxy: it was already checked in _find_working_entry, so return immediately
-                logger.debug("No baseline IP found; returning first-working proxy without waiting for IP change")
-            else:
-                # There is a baseline: wait for the IP to change
-                logger.info(f"ensure_new_ip=True and single proxy, waiting for IP change: {entry[0]}:{entry[1]}")
-                entry = self._wait_for_new_ip(
-                    entry, baseline, ensure_new_ip_timeout, ensure_new_ip_interval, check_timeout
-                )
-
         return entry

     def _get_round_robin_candidates(self, pool):
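
The retry loop is still tenacity, but the stop condition is now the rotation interval. Newer tenacity releases accept a timedelta as well as plain seconds in stop_after_delay, which is presumably what the backward-compatibility comment refers to; if that reading is right, the pattern looks like this (the always-failing probe is a stand-in):

    from datetime import timedelta

    import tenacity

    def probe():
        raise RuntimeError("no working proxy yet")

    retrying = tenacity.Retrying(
        stop=tenacity.stop_after_delay(timedelta(seconds=2)),  # plain seconds work too
        reraise=True,  # surface the last error instead of a RetryError
    )
    try:
        retrying(probe)  # keeps calling probe() until ~2 s have elapsed
    except RuntimeError:
        print("gave up after the window expired")
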
@@ -334,15 +275,29 @@ class ProxyInterface:
             pool = self.entries
         return pool

-    def _get_candidates(self, pool, randomize):
-        if randomize:
-            candidates = [idx for idx, entry in enumerate(self.entries) if entry in pool]
-            return random.sample(candidates, k=len(candidates))
-        else:
-            return self._get_round_robin_candidates(pool)
+    def _refresh_traversal_queue(self, pool, randomize):
+        # Build current pool indices
+        current_pool_indices = [idx for idx, entry in enumerate(self.entries) if entry in pool]

-    def _find_working_entry_once(self, candidates, check_timeout, cooldown_seconds):
-        for idx in candidates:
+        # Check if we need to refill the traversal queue
+        if not self._traversal_queue and current_pool_indices:
+            if randomize:
+                self._traversal_queue = current_pool_indices.copy()
+                random.shuffle(self._traversal_queue)
+            else:
+                # Round-robin: start from next after current_index
+                self._traversal_queue = []
+                start_idx = (self.current_index + 1) % len(self.entries) if self.current_index >= 0 else 0
+                for i in range(len(self.entries)):
+                    idx = (start_idx + i) % len(self.entries)
+                    if idx in current_pool_indices:
+                        self._traversal_queue.append(idx)
+            self._traversal_start = time.time()
+            self._traversal_cycle += 1
+
+    def _find_working_entry_once(self, check_timeout, cooldown_seconds):
+        # Consume from traversal queue for cached checks
+        for idx in self._traversal_queue:
             entry = self.entries[idx]
             health = self._health.get(entry, {})
             last_checked = health.get("last_checked", 0)
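
The per-call candidate list becomes a persistent queue: when it empties, it is rebuilt from the active pool, either shuffled or in round-robin order starting just after current_index, and the traversal cycle counter increments. The ordering arithmetic in isolation (our naming, same logic as the hunk):

    import random

    def refill_queue(n_entries, pool_indices, current_index, randomize):
        if randomize:
            queue = list(pool_indices)
            random.shuffle(queue)
            return queue
        start_idx = (current_index + 1) % n_entries if current_index >= 0 else 0
        return [
            (start_idx + i) % n_entries
            for i in range(n_entries)
            if (start_idx + i) % n_entries in pool_indices
        ]

    # 4 entries, active pool {0, 2, 3}, last used index 2 -> check order [3, 0, 2]
    assert refill_queue(4, {0, 2, 3}, 2, randomize=False) == [3, 0, 2]
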
@@ -352,23 +307,17 @@ class ProxyInterface:
             if ok and (now - last_checked) < cooldown_seconds:
                 logger.debug(f"Using cached working proxy: {entry[0]}:{entry[1]}")
                 self.current_index = idx
+                self._traversal_queue.remove(idx)
                 return entry
             elif not ok and (now - last_checked) < cooldown_seconds:
                 continue
             else:
                 logger.debug(f"Checking proxy health: {entry[0]}:{entry[1]}")
-                if self.is_entry_alive(entry, timeout=check_timeout):
+                if self._is_entry_alive(entry, timeout=check_timeout):
                     self.current_index = idx
+                    self._traversal_queue.remove(idx)
                     return entry

-        logger.warning("No cached working proxies, forcing fresh checks")
-        for idx in candidates:
-            entry = self.entries[idx]
-            logger.debug(f"Force-checking proxy: {entry[0]}:{entry[1]}")
-            if self.is_entry_alive(entry, timeout=check_timeout):
-                self.current_index = idx
-                return entry
-
         raise NoWorkingProxiesError("No working proxies available")

     def _wait_for_new_ip(self, entry, baseline, timeout, interval, check_timeout):
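
Each queued entry now gets a three-way decision against the health cache: reuse a recent success, skip a recent failure, or re-probe anything older than the cooldown (30 s by default in this version). Note also that the forced-fresh fallback pass is gone: if every queued entry has a recently failed check, the call raises NoWorkingProxiesError and leaves retrying to the tenacity window above. The decision table as a small helper (our naming):

    import time

    def cache_decision(health, cooldown_seconds=30):
        # health: {"ok": bool, "last_checked": epoch seconds, ...}
        age = time.time() - health.get("last_checked", 0)
        if age < cooldown_seconds:
            return "use_cached" if health.get("ok") else "skip"
        return "recheck"
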
@@ -386,9 +335,9 @@ class ProxyInterface:
                 current_ip = None

             if current_ip and current_ip != baseline:
-                self.mark_entry_status(entry, True, last_ip=current_ip)
+                self._mark_entry_status(entry, True, last_ip=current_ip)
                 logger.info(f"IP changed from {baseline} to {current_ip}")
-                return entry
+                return

             time.sleep(interval)