datamarket 0.7.104__tar.gz → 0.7.106__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datamarket might be problematic.

Files changed (35)
  1. {datamarket-0.7.104 → datamarket-0.7.106}/PKG-INFO +1 -1
  2. {datamarket-0.7.104 → datamarket-0.7.106}/pyproject.toml +1 -1
  3. {datamarket-0.7.104 → datamarket-0.7.106}/src/datamarket/interfaces/proxy.py +87 -27
  4. {datamarket-0.7.104 → datamarket-0.7.106}/LICENSE +0 -0
  5. {datamarket-0.7.104 → datamarket-0.7.106}/README.md +0 -0
  6. {datamarket-0.7.104 → datamarket-0.7.106}/src/datamarket/__init__.py +0 -0
  7. {datamarket-0.7.104 → datamarket-0.7.106}/src/datamarket/exceptions/__init__.py +0 -0
  8. {datamarket-0.7.104 → datamarket-0.7.106}/src/datamarket/exceptions/main.py +0 -0
  9. {datamarket-0.7.104 → datamarket-0.7.106}/src/datamarket/interfaces/__init__.py +0 -0
  10. {datamarket-0.7.104 → datamarket-0.7.106}/src/datamarket/interfaces/alchemy.py +0 -0
  11. {datamarket-0.7.104 → datamarket-0.7.106}/src/datamarket/interfaces/aws.py +0 -0
  12. {datamarket-0.7.104 → datamarket-0.7.106}/src/datamarket/interfaces/azure.py +0 -0
  13. {datamarket-0.7.104 → datamarket-0.7.106}/src/datamarket/interfaces/drive.py +0 -0
  14. {datamarket-0.7.104 → datamarket-0.7.106}/src/datamarket/interfaces/ftp.py +0 -0
  15. {datamarket-0.7.104 → datamarket-0.7.106}/src/datamarket/interfaces/nominatim.py +0 -0
  16. {datamarket-0.7.104 → datamarket-0.7.106}/src/datamarket/interfaces/peerdb.py +0 -0
  17. {datamarket-0.7.104 → datamarket-0.7.106}/src/datamarket/interfaces/tinybird.py +0 -0
  18. {datamarket-0.7.104 → datamarket-0.7.106}/src/datamarket/params/__init__.py +0 -0
  19. {datamarket-0.7.104 → datamarket-0.7.106}/src/datamarket/params/nominatim.py +0 -0
  20. {datamarket-0.7.104 → datamarket-0.7.106}/src/datamarket/utils/__init__.py +0 -0
  21. {datamarket-0.7.104 → datamarket-0.7.106}/src/datamarket/utils/airflow.py +0 -0
  22. {datamarket-0.7.104 → datamarket-0.7.106}/src/datamarket/utils/alchemy.py +0 -0
  23. {datamarket-0.7.104 → datamarket-0.7.106}/src/datamarket/utils/main.py +0 -0
  24. {datamarket-0.7.104 → datamarket-0.7.106}/src/datamarket/utils/nominatim.py +0 -0
  25. {datamarket-0.7.104 → datamarket-0.7.106}/src/datamarket/utils/playwright/__init__.py +0 -0
  26. {datamarket-0.7.104 → datamarket-0.7.106}/src/datamarket/utils/playwright/async_api.py +0 -0
  27. {datamarket-0.7.104 → datamarket-0.7.106}/src/datamarket/utils/playwright/sync_api.py +0 -0
  28. {datamarket-0.7.104 → datamarket-0.7.106}/src/datamarket/utils/selenium.py +0 -0
  29. {datamarket-0.7.104 → datamarket-0.7.106}/src/datamarket/utils/soda.py +0 -0
  30. {datamarket-0.7.104 → datamarket-0.7.106}/src/datamarket/utils/strings/__init__.py +0 -0
  31. {datamarket-0.7.104 → datamarket-0.7.106}/src/datamarket/utils/strings/normalization.py +0 -0
  32. {datamarket-0.7.104 → datamarket-0.7.106}/src/datamarket/utils/strings/obfuscation.py +0 -0
  33. {datamarket-0.7.104 → datamarket-0.7.106}/src/datamarket/utils/strings/standardization.py +0 -0
  34. {datamarket-0.7.104 → datamarket-0.7.106}/src/datamarket/utils/typer.py +0 -0
  35. {datamarket-0.7.104 → datamarket-0.7.106}/src/datamarket/utils/types.py +0 -0
PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: datamarket
-Version: 0.7.104
+Version: 0.7.106
 Summary: Utilities that integrate advanced scraping knowledge into just one library.
 License: GPL-3.0-or-later
 Author: DataMarket
pyproject.toml

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "datamarket"
-version = "0.7.104"
+version = "0.7.106"
 description = "Utilities that integrate advanced scraping knowledge into just one library."
 authors = ["DataMarket <techsupport@datamarket.es>"]
 license = "GPL-3.0-or-later"
src/datamarket/interfaces/proxy.py

@@ -22,7 +22,7 @@ class ProxyInterface:
 
     def __init__(self, config):
         self._load_from_config(config)
-        self.current_index = random.randrange(len(self.entries)) if self.entries else 0  # noqa: S311
+        self.current_index = -2  # -2 means no selection made yet, -1 means Tor selected
         self._health = {}  # {entry: {"ok": bool, "last_checked": time.time(), "last_error": str}}
 
     def _load_from_config(self, cfg):
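
Note: the constructor no longer starts at a random index; current_index is now a sentinel until a selection is made. A minimal sketch (hypothetical helper, not part of the package) of what the new convention means for code that inspects current_index, assuming entries holds (host, port, user, password) tuples:

    # Hypothetical helper illustrating the sentinel convention introduced in 0.7.106.
    def describe_selection(current_index, entries):
        if current_index == -2:
            return "no proxy selected yet"
        if current_index == -1:
            return "Tor (127.0.0.1:9050) is the current selection"
        if 0 <= current_index < len(entries):
            host, port, _user, _pwd = entries[current_index]
            return f"proxy {host}:{port} is the current selection"
        return "invalid selection"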
@@ -92,6 +92,7 @@ class ProxyInterface:
         """
         # Tor handling (skip health check for tor)
         if use_tor:
+            self.current_index = -1  # Indicate Tor is selected
             if raw:
                 return ("127.0.0.1", "9050", None, None)
             return {"socks5": self.get_proxy_url("127.0.0.1", 9050, schema="socks5")}
@@ -135,15 +136,19 @@ class ProxyInterface:
         if not pool:
             pool = self.entries
 
-        # Find next in pool using current_index
-        for _ in range(len(self.entries)):
-            idx = self.current_index
-            self.current_index = (self.current_index + 1) % len(self.entries)
+        # Start from the next index after current_index
+        start_idx = (self.current_index + 1) % len(self.entries) if self.current_index >= 0 else 0
+
+        # Find next in pool starting from start_idx
+        for i in range(len(self.entries)):
+            idx = (start_idx + i) % len(self.entries)
             entry = self.entries[idx]
             if entry in pool:
+                self.current_index = idx  # Update to selected index
                 return entry
 
         # Fallback to first entry
+        self.current_index = 0
         return self.entries[0]
 
     def get_random(self, use_auth=False):
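
To illustrate the new rotation behaviour, a small standalone sketch (hypothetical names, not part of datamarket) showing that current_index now records the selected slot and the next call resumes from the slot after it:

    # Standalone round-robin sketch mirroring the logic in the hunk above.
    entries = ["proxy-a", "proxy-b", "proxy-c"]
    current_index = -2  # no selection made yet

    def get_next(pool=None):
        global current_index
        pool = pool or entries
        start_idx = (current_index + 1) % len(entries) if current_index >= 0 else 0
        for i in range(len(entries)):
            idx = (start_idx + i) % len(entries)
            if entries[idx] in pool:
                current_index = idx
                return entries[idx]
        current_index = 0
        return entries[0]

    print([get_next() for _ in range(4)])  # ['proxy-a', 'proxy-b', 'proxy-c', 'proxy-a']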
@@ -156,10 +161,10 @@ class ProxyInterface:
             pool = self.entries
 
         entry = random.choice(pool)  # noqa: S311
-        # Update index to after selected entry for round-robin continuity
+        # Update index to selected entry
        try:
            pos = self.entries.index(entry)
-            self.current_index = (pos + 1) % len(self.entries)
+            self.current_index = pos
        except ValueError:
            pass
 
@@ -179,17 +184,63 @@ class ProxyInterface:
             return
 
         try:
-            logger.info(f"Current IP: {self.check_current_ip()}")
+            logger.debug(f"Current IP: {self.check_current_ip()}")
             with Controller.from_port(port=9051) as controller:
                 controller.authenticate(password=self.tor_password)
                 controller.signal(Signal.NEWNYM)
 
             time.sleep(5)
-            logger.info(f"New IP: {self.check_current_ip()}")
+            logger.debug(f"New IP: {self.check_current_ip()}")
         except Exception as ex:
             logger.error("Failed to renew Tor IP")
             logger.error(ex)
 
+    def wait_for_new_ip(self, timeout=600, interval=5, check_timeout=5):
+        """
+        Wait for the IP address of the currently selected proxy to change.
+
+        :param timeout: Max seconds to wait for IP change
+        :param interval: Seconds between IP checks
+        :param check_timeout: Timeout for individual IP check requests
+        :return: The selected entry (unchanged)
+        :raises RuntimeError: If no proxy is available or baseline cannot be determined
+        :raises EnsureNewIPTimeoutError: If IP doesn't change within timeout
+        """
+        # Use currently selected proxy
+        if self.current_index == -1:
+            # Tor is selected
+            entry = ("127.0.0.1", "9050", None, None)
+        elif self.current_index >= 0 and self.current_index < len(self.entries):
+            # current_index points to the selected entry
+            entry = self.entries[self.current_index]
+        else:
+            # No valid selection, select one
+            logger.debug("No proxy currently selected, selecting one for IP waiting")
+            self.get_proxies(raw=True)
+            if self.current_index == -1:
+                entry = ("127.0.0.1", "9050", None, None)
+            elif self.current_index >= 0 and self.current_index < len(self.entries):
+                entry = self.entries[self.current_index]
+            else:
+                raise RuntimeError("Could not select a proxy for IP waiting")
+
+        # Auto-detect baseline IP
+        host, port, user, pwd = entry
+        proxies_map = {
+            "http": self.get_proxy_url(host, port, user, pwd, "http"),
+            "https": self.get_proxy_url(host, port, user, pwd, "http"),
+        }
+        try:
+            resp = requests.get(self.CHECK_IP_URL, proxies=proxies_map, timeout=check_timeout)
+            baseline = resp.json().get("YourFuckingIPAddress")
+            if not baseline:
+                raise RuntimeError(f"Could not determine baseline IP for entry {host}:{port}")
+            logger.debug(f"Auto-detected baseline IP: {baseline}")
+        except Exception as ex:
+            raise RuntimeError(f"Could not determine baseline IP for entry {host}:{port}: {ex}") from ex
+
+        return self._wait_for_new_ip(entry, baseline, timeout, interval, check_timeout)
+
     def mark_entry_status(self, entry, ok, error=None, last_ip=None):
         """Update health cache for an entry."""
         self._health[entry] = {
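
The new wait_for_new_ip method delegates to _wait_for_new_ip after auto-detecting a baseline from the package's CHECK_IP_URL (which, per the diff, returns JSON with a YourFuckingIPAddress field). A self-contained sketch of the underlying technique — a hypothetical helper, not the package's implementation, using a plain-text IP-echo endpoint for simplicity:

    # Hypothetical wait-for-new-IP helper: poll an IP-echo endpoint through the
    # proxy until the reported address differs from the baseline.
    import time
    import requests

    def wait_for_ip_change(proxy_url, baseline, check_url="https://api.ipify.org",
                           timeout=600, interval=5, check_timeout=5):
        deadline = time.time() + timeout
        proxies = {"http": proxy_url, "https": proxy_url}
        while time.time() < deadline:
            try:
                current = requests.get(check_url, proxies=proxies, timeout=check_timeout).text.strip()
                if current and current != baseline:
                    return current
            except requests.RequestException:
                pass  # transient failure: keep polling until the deadline
            time.sleep(interval)
        raise TimeoutError(f"IP did not change from {baseline} within {timeout}s")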
@@ -231,6 +282,13 @@ class ProxyInterface:
         pool = self._build_pool(use_auth)
         candidates = self._get_candidates(pool, randomize)
 
+        # Capture baseline before any health checks that might set it
+        baseline_before = None
+        if len(candidates) == 1:
+            idx = candidates[0]
+            entry = self.entries[idx]
+            baseline_before = self._health.get(entry, {}).get("last_ip")
+
         def _find_working_entry():
             if not self.entries:
                 raise NoWorkingProxiesError("No proxies available")
@@ -246,32 +304,28 @@ class ProxyInterface:
         entry = _find_working_entry()
 
         if ensure_new_ip and len(pool) == 1:
-            logger.info(f"ensure_new_ip=True and single proxy, waiting for IP change: {entry[0]}:{entry[1]}")
             baseline = self._health.get(entry, {}).get("last_ip")
-            if not baseline:
-                if not self.is_entry_alive(entry, timeout=check_timeout):
-                    raise NoWorkingProxiesError("Proxy became unavailable during ensure_new_ip")
-                baseline = self._health.get(entry, {}).get("last_ip")
-            entry = self._wait_for_new_ip(entry, baseline, ensure_new_ip_timeout, ensure_new_ip_interval, check_timeout)
+            if not baseline_before:
+                # First time seeing this proxy: it was already checked in _find_working_entry, so return immediately
+                logger.debug("No baseline IP found; returning first-working proxy without waiting for IP change")
+            else:
+                # There is a baseline: wait for the IP to change
+                logger.info(f"ensure_new_ip=True and single proxy, waiting for IP change: {entry[0]}:{entry[1]}")
+                entry = self._wait_for_new_ip(
+                    entry, baseline, ensure_new_ip_timeout, ensure_new_ip_interval, check_timeout
+                )
 
         return entry
 
     def _get_round_robin_candidates(self, pool):
         """Get candidates in round-robin order starting from current_index."""
         candidates = []
-        start_idx = self.current_index
+        start_idx = (self.current_index + 1) % len(self.entries) if self.current_index >= 0 else 0
         for i in range(len(self.entries)):
             idx = (start_idx + i) % len(self.entries)
             entry = self.entries[idx]
             if entry in pool:
-                candidates.append(entry)
-        # Update current_index for next call
-        if candidates:
-            try:
-                pos = self.entries.index(candidates[0])
-                self.current_index = (pos + 1) % len(self.entries)
-            except ValueError:
-                pass
+                candidates.append(idx)
         return candidates
 
     def _build_pool(self, use_auth):
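
Since _get_round_robin_candidates now returns entry indices rather than entry tuples (and no longer advances current_index itself), the ordering can be reproduced with a small worked example (illustrative values only, not taken from the package):

    # Worked example of the index-based candidate ordering.
    entries = ["a", "b", "c", "d"]
    pool = ["b", "d"]
    current_index = 1  # "b" was selected on the previous call

    start_idx = (current_index + 1) % len(entries) if current_index >= 0 else 0
    candidates = [
        (start_idx + i) % len(entries)
        for i in range(len(entries))
        if entries[(start_idx + i) % len(entries)] in pool
    ]
    print(candidates)  # [3, 1]: try "d" first, then wrap around to "b"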
@@ -282,12 +336,14 @@ class ProxyInterface:
 
     def _get_candidates(self, pool, randomize):
         if randomize:
-            return pool[:]
+            candidates = [idx for idx, entry in enumerate(self.entries) if entry in pool]
+            return random.sample(candidates, k=len(candidates))
         else:
             return self._get_round_robin_candidates(pool)
 
     def _find_working_entry_once(self, candidates, check_timeout, cooldown_seconds):
-        for entry in candidates:
+        for idx in candidates:
+            entry = self.entries[idx]
             health = self._health.get(entry, {})
             last_checked = health.get("last_checked", 0)
             ok = health.get("ok", False)
@@ -295,18 +351,22 @@ class ProxyInterface:
 
             if ok and (now - last_checked) < cooldown_seconds:
                 logger.debug(f"Using cached working proxy: {entry[0]}:{entry[1]}")
+                self.current_index = idx
                 return entry
             elif not ok and (now - last_checked) < cooldown_seconds:
                 continue
             else:
                 logger.debug(f"Checking proxy health: {entry[0]}:{entry[1]}")
                 if self.is_entry_alive(entry, timeout=check_timeout):
+                    self.current_index = idx
                     return entry
 
         logger.warning("No cached working proxies, forcing fresh checks")
-        for entry in candidates:
+        for idx in candidates:
+            entry = self.entries[idx]
             logger.debug(f"Force-checking proxy: {entry[0]}:{entry[1]}")
             if self.is_entry_alive(entry, timeout=check_timeout):
+                self.current_index = idx
                 return entry
 
         raise NoWorkingProxiesError("No working proxies available")