datamarket 0.7.106__py3-none-any.whl → 0.7.107__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datamarket might be problematic. Click here for more details.
- datamarket/interfaces/proxy.py +84 -135
- {datamarket-0.7.106.dist-info → datamarket-0.7.107.dist-info}/METADATA +1 -1
- {datamarket-0.7.106.dist-info → datamarket-0.7.107.dist-info}/RECORD +5 -5
- {datamarket-0.7.106.dist-info → datamarket-0.7.107.dist-info}/LICENSE +0 -0
- {datamarket-0.7.106.dist-info → datamarket-0.7.107.dist-info}/WHEEL +0 -0
datamarket/interfaces/proxy.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
import random
|
|
3
3
|
import time
|
|
4
|
+
from datetime import timedelta
|
|
4
5
|
|
|
5
6
|
import requests
|
|
6
7
|
import tenacity
|
|
@@ -12,6 +13,9 @@ from datamarket.exceptions import EnsureNewIPTimeoutError, NoWorkingProxiesError
|
|
|
12
13
|
logger = logging.getLogger(__name__)
|
|
13
14
|
logging.getLogger("stem").setLevel(logging.WARNING)
|
|
14
15
|
|
|
16
|
+
PROXY_ROTATION_INTERVAL = timedelta(minutes=10)
|
|
17
|
+
PROXY_ROTATION_TIMEOUT_SECONDS = int(PROXY_ROTATION_INTERVAL.total_seconds())
|
|
18
|
+
|
|
15
19
|
|
|
16
20
|
class ProxyInterface:
|
|
17
21
|
"""
|
|
@@ -24,6 +28,10 @@ class ProxyInterface:
|
|
|
24
28
|
self._load_from_config(config)
|
|
25
29
|
self.current_index = -2 # -2 means no selection made yet, -1 means Tor selected
|
|
26
30
|
self._health = {} # {entry: {"ok": bool, "last_checked": time.time(), "last_error": str}}
|
|
31
|
+
self._traversal_queue = [] # Queue of indices left to test in current traversal
|
|
32
|
+
self._traversal_start = None # Timestamp when current traversal started
|
|
33
|
+
self._last_ip_wait = {} # {entry: (timestamp, traversal_cycle)} - last time we attempted to wait for IP change
|
|
34
|
+
self._traversal_cycle = 0 # Counter of full traversals of the queue
|
|
27
35
|
|
|
28
36
|
def _load_from_config(self, cfg):
|
|
29
37
|
# Tor password (optional)
|
|
@@ -67,13 +75,9 @@ class ProxyInterface:
|
|
|
67
75
|
raw=False,
|
|
68
76
|
use_auth=False,
|
|
69
77
|
use_socks=False,
|
|
70
|
-
health_check=True,
|
|
71
78
|
check_timeout=5,
|
|
72
|
-
cooldown_seconds=
|
|
73
|
-
|
|
74
|
-
ensure_new_ip_timeout=600,
|
|
75
|
-
ensure_new_ip_interval=5,
|
|
76
|
-
max_retry_seconds=600,
|
|
79
|
+
cooldown_seconds=30,
|
|
80
|
+
proxy_rotation_interval=PROXY_ROTATION_INTERVAL,
|
|
77
81
|
):
|
|
78
82
|
"""
|
|
79
83
|
Return parsed proxy URLs or raw entry tuple for a working proxy.
|
|
@@ -82,13 +86,9 @@ class ProxyInterface:
|
|
|
82
86
|
:param randomize: select a random proxy if True, otherwise round-robin
|
|
83
87
|
:param raw: return raw (host, port, user, password) tuple if True
|
|
84
88
|
:param use_auth: include proxies that require authentication if True; otherwise only credential-free
|
|
85
|
-
:param health_check: perform health checks to ensure proxy is working if True
|
|
86
89
|
:param check_timeout: timeout in seconds for health check requests
|
|
87
90
|
:param cooldown_seconds: how long to cache health status before re-checking
|
|
88
|
-
:param
|
|
89
|
-
:param ensure_new_ip_timeout: max seconds to wait for IP change when ensure_new_ip=True
|
|
90
|
-
:param ensure_new_ip_interval: seconds between IP checks when ensure_new_ip=True
|
|
91
|
-
:param max_retry_seconds: max seconds to retry finding working proxies (0 to disable)
|
|
91
|
+
:param proxy_rotation_interval: max time to retry finding working proxies (timedelta or seconds, 0 to disable)
|
|
92
92
|
"""
|
|
93
93
|
# Tor handling (skip health check for tor)
|
|
94
94
|
if use_tor:
|
|
@@ -98,20 +98,13 @@ class ProxyInterface:
|
|
|
98
98
|
return {"socks5": self.get_proxy_url("127.0.0.1", 9050, schema="socks5")}
|
|
99
99
|
|
|
100
100
|
# Get a working entry (always health-checked; cached results are reused within cooldown_seconds)
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
ensure_new_ip_timeout=ensure_new_ip_timeout,
|
|
109
|
-
ensure_new_ip_interval=ensure_new_ip_interval,
|
|
110
|
-
max_retry_seconds=max_retry_seconds,
|
|
111
|
-
)
|
|
112
|
-
else:
|
|
113
|
-
# Legacy behavior: no health check
|
|
114
|
-
host, port, user, password = self.get_random(use_auth) if randomize else self.get_next(use_auth)
|
|
101
|
+
host, port, user, password = self._get_working_entry(
|
|
102
|
+
use_auth=use_auth,
|
|
103
|
+
randomize=randomize,
|
|
104
|
+
check_timeout=check_timeout,
|
|
105
|
+
cooldown_seconds=cooldown_seconds,
|
|
106
|
+
proxy_rotation_interval=proxy_rotation_interval,
|
|
107
|
+
)
|
|
115
108
|
|
|
116
109
|
if raw:
|
|
117
110
|
return host, port, user, password
|
|
@@ -127,49 +120,6 @@ class ProxyInterface:
|
|
|
127
120
|
"https": self.get_proxy_url(host, port, user, password, "http"),
|
|
128
121
|
}
|
|
129
122
|
|
|
130
|
-
def get_next(self, use_auth=False):
|
|
131
|
-
# Round-robin selection, optionally filtering out authenticated proxies
|
|
132
|
-
if not self.entries:
|
|
133
|
-
raise NoWorkingProxiesError("No proxies available")
|
|
134
|
-
|
|
135
|
-
pool = self.entries if use_auth else [e for e in self.entries if not e[2] and not e[3]]
|
|
136
|
-
if not pool:
|
|
137
|
-
pool = self.entries
|
|
138
|
-
|
|
139
|
-
# Start from the next index after current_index
|
|
140
|
-
start_idx = (self.current_index + 1) % len(self.entries) if self.current_index >= 0 else 0
|
|
141
|
-
|
|
142
|
-
# Find next in pool starting from start_idx
|
|
143
|
-
for i in range(len(self.entries)):
|
|
144
|
-
idx = (start_idx + i) % len(self.entries)
|
|
145
|
-
entry = self.entries[idx]
|
|
146
|
-
if entry in pool:
|
|
147
|
-
self.current_index = idx # Update to selected index
|
|
148
|
-
return entry
|
|
149
|
-
|
|
150
|
-
# Fallback to first entry
|
|
151
|
-
self.current_index = 0
|
|
152
|
-
return self.entries[0]
|
|
153
|
-
|
|
154
|
-
def get_random(self, use_auth=False):
|
|
155
|
-
# Random selection, optionally filtering out authenticated proxies
|
|
156
|
-
if not self.entries:
|
|
157
|
-
raise NoWorkingProxiesError("No proxies available")
|
|
158
|
-
|
|
159
|
-
pool = self.entries if use_auth else [e for e in self.entries if not e[2] and not e[3]]
|
|
160
|
-
if not pool:
|
|
161
|
-
pool = self.entries
|
|
162
|
-
|
|
163
|
-
entry = random.choice(pool) # noqa: S311
|
|
164
|
-
# Update index to selected entry
|
|
165
|
-
try:
|
|
166
|
-
pos = self.entries.index(entry)
|
|
167
|
-
self.current_index = pos
|
|
168
|
-
except ValueError:
|
|
169
|
-
pass
|
|
170
|
-
|
|
171
|
-
return entry
|
|
172
|
-
|
|
173
123
|
def check_current_ip(self, proxies=None):
|
|
174
124
|
try:
|
|
175
125
|
proxies_arg = proxies or {"http": self.proxies["http"]}
|
|
@@ -195,9 +145,9 @@ class ProxyInterface:
|
|
|
195
145
|
logger.error("Failed to renew Tor IP")
|
|
196
146
|
logger.error(ex)
|
|
197
147
|
|
|
198
|
-
def wait_for_new_ip(self, timeout=
|
|
148
|
+
def wait_for_new_ip(self, timeout=PROXY_ROTATION_TIMEOUT_SECONDS, interval=30, check_timeout=5):
|
|
199
149
|
"""
|
|
200
|
-
|
|
150
|
+
Ensures that the IP address of the selected proxy differs from any other proxy chosen within the proxy rotation interval.
|
|
201
151
|
|
|
202
152
|
:param timeout: Max seconds to wait for IP change
|
|
203
153
|
:param interval: Seconds between IP checks
|
|
@@ -224,24 +174,36 @@ class ProxyInterface:
|
|
|
224
174
|
else:
|
|
225
175
|
raise RuntimeError("Could not select a proxy for IP waiting")
|
|
226
176
|
|
|
227
|
-
#
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
if
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
177
|
+
# Check if we should skip waiting based on global cooldown and traversal cycle
|
|
178
|
+
now = time.time()
|
|
179
|
+
interval_seconds = PROXY_ROTATION_INTERVAL.total_seconds()
|
|
180
|
+
for last_wait in self._last_ip_wait.values():
|
|
181
|
+
last_ts, last_cycle = last_wait
|
|
182
|
+
|
|
183
|
+
time_recent = last_ts is not None and (now - last_ts) <= interval_seconds
|
|
184
|
+
no_full_rotation = self._traversal_cycle <= last_cycle
|
|
185
|
+
|
|
186
|
+
# Skip only if both conditions are true: recent wait AND no full traversal cycle since
|
|
187
|
+
if time_recent and no_full_rotation:
|
|
188
|
+
logger.debug(
|
|
189
|
+
"Skipping wait_for_new_ip: last wait %.1fs ago and no full traversal since (last_cycle=%s current=%s)",
|
|
190
|
+
now - last_ts,
|
|
191
|
+
last_cycle,
|
|
192
|
+
self._traversal_cycle,
|
|
193
|
+
)
|
|
194
|
+
return
|
|
241
195
|
|
|
196
|
+
# Mark we are now attempting to wait for this entry
|
|
197
|
+
self._last_ip_wait[entry] = (now, self._traversal_cycle)
|
|
198
|
+
|
|
199
|
+
# Try to use cached baseline IP from health check
|
|
200
|
+
health = self._health.get(entry, {})
|
|
201
|
+
baseline = health.get("last_ip")
|
|
202
|
+
if baseline is None:
|
|
203
|
+
raise RuntimeError(f"Could not determine baseline IP for entry {entry[0]}:{entry[1]}")
|
|
242
204
|
return self._wait_for_new_ip(entry, baseline, timeout, interval, check_timeout)
|
|
243
205
|
|
|
244
|
-
def
|
|
206
|
+
def _mark_entry_status(self, entry, ok, error=None, last_ip=None):
|
|
245
207
|
"""Update health cache for an entry."""
|
|
246
208
|
self._health[entry] = {
|
|
247
209
|
"ok": ok,
|
|
@@ -250,7 +212,7 @@ class ProxyInterface:
|
|
|
250
212
|
"last_ip": last_ip,
|
|
251
213
|
}
|
|
252
214
|
|
|
253
|
-
def
|
|
215
|
+
def _is_entry_alive(self, entry, timeout=5):
|
|
254
216
|
"""Check if a proxy entry is working by making a test request."""
|
|
255
217
|
host, port, user, pwd = entry
|
|
256
218
|
try:
|
|
@@ -261,10 +223,10 @@ class ProxyInterface:
|
|
|
261
223
|
resp = requests.get(self.CHECK_IP_URL, proxies=proxies, timeout=timeout)
|
|
262
224
|
ok = resp.status_code == 200
|
|
263
225
|
last_ip = resp.json().get("YourFuckingIPAddress") if ok else None
|
|
264
|
-
self.
|
|
226
|
+
self._mark_entry_status(entry, ok, last_ip=last_ip)
|
|
265
227
|
return ok
|
|
266
228
|
except Exception as ex:
|
|
267
|
-
self.
|
|
229
|
+
self._mark_entry_status(entry, False, str(ex))
|
|
268
230
|
return False
|
|
269
231
|
|
|
270
232
|
def _get_working_entry(
|
|
@@ -272,49 +234,28 @@ class ProxyInterface:
|
|
|
272
234
|
use_auth=False,
|
|
273
235
|
randomize=False,
|
|
274
236
|
check_timeout=5,
|
|
275
|
-
cooldown_seconds=
|
|
276
|
-
|
|
277
|
-
ensure_new_ip_timeout=600,
|
|
278
|
-
ensure_new_ip_interval=5,
|
|
279
|
-
max_retry_seconds=600,
|
|
237
|
+
cooldown_seconds=30,
|
|
238
|
+
proxy_rotation_interval=PROXY_ROTATION_INTERVAL,
|
|
280
239
|
):
|
|
281
240
|
"""Get a working proxy entry, performing health checks as needed."""
|
|
282
241
|
pool = self._build_pool(use_auth)
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
# Capture baseline before any health checks that might set it
|
|
286
|
-
baseline_before = None
|
|
287
|
-
if len(candidates) == 1:
|
|
288
|
-
idx = candidates[0]
|
|
289
|
-
entry = self.entries[idx]
|
|
290
|
-
baseline_before = self._health.get(entry, {}).get("last_ip")
|
|
242
|
+
self._refresh_traversal_queue(pool, randomize)
|
|
291
243
|
|
|
292
244
|
def _find_working_entry():
|
|
293
245
|
if not self.entries:
|
|
294
246
|
raise NoWorkingProxiesError("No proxies available")
|
|
295
|
-
return self._find_working_entry_once(
|
|
247
|
+
return self._find_working_entry_once(check_timeout, cooldown_seconds)
|
|
296
248
|
|
|
297
|
-
|
|
249
|
+
# Handle both timedelta and numeric seconds for backward compatibility
|
|
250
|
+
if proxy_rotation_interval:
|
|
298
251
|
retrying = tenacity.Retrying(
|
|
299
|
-
stop=tenacity.stop_after_delay(
|
|
252
|
+
stop=tenacity.stop_after_delay(proxy_rotation_interval),
|
|
300
253
|
reraise=True,
|
|
301
254
|
)
|
|
302
255
|
entry = retrying(_find_working_entry)
|
|
303
256
|
else:
|
|
304
257
|
entry = _find_working_entry()
|
|
305
258
|
|
|
306
|
-
if ensure_new_ip and len(pool) == 1:
|
|
307
|
-
baseline = self._health.get(entry, {}).get("last_ip")
|
|
308
|
-
if not baseline_before:
|
|
309
|
-
# First time seeing this proxy: it was already checked in _find_working_entry, so return immediately
|
|
310
|
-
logger.debug("No baseline IP found; returning first-working proxy without waiting for IP change")
|
|
311
|
-
else:
|
|
312
|
-
# There is a baseline: wait for the IP to change
|
|
313
|
-
logger.info(f"ensure_new_ip=True and single proxy, waiting for IP change: {entry[0]}:{entry[1]}")
|
|
314
|
-
entry = self._wait_for_new_ip(
|
|
315
|
-
entry, baseline, ensure_new_ip_timeout, ensure_new_ip_interval, check_timeout
|
|
316
|
-
)
|
|
317
|
-
|
|
318
259
|
return entry
|
|
319
260
|
|
|
320
261
|
def _get_round_robin_candidates(self, pool):
|
|
@@ -334,15 +275,29 @@ class ProxyInterface:
|
|
|
334
275
|
pool = self.entries
|
|
335
276
|
return pool
|
|
336
277
|
|
|
337
|
-
def
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
return random.sample(candidates, k=len(candidates))
|
|
341
|
-
else:
|
|
342
|
-
return self._get_round_robin_candidates(pool)
|
|
278
|
+
def _refresh_traversal_queue(self, pool, randomize):
|
|
279
|
+
# Build current pool indices
|
|
280
|
+
current_pool_indices = [idx for idx, entry in enumerate(self.entries) if entry in pool]
|
|
343
281
|
|
|
344
|
-
|
|
345
|
-
|
|
282
|
+
# Check if we need to refill the traversal queue
|
|
283
|
+
if not self._traversal_queue and current_pool_indices:
|
|
284
|
+
if randomize:
|
|
285
|
+
self._traversal_queue = current_pool_indices.copy()
|
|
286
|
+
random.shuffle(self._traversal_queue)
|
|
287
|
+
else:
|
|
288
|
+
# Round-robin: start from next after current_index
|
|
289
|
+
self._traversal_queue = []
|
|
290
|
+
start_idx = (self.current_index + 1) % len(self.entries) if self.current_index >= 0 else 0
|
|
291
|
+
for i in range(len(self.entries)):
|
|
292
|
+
idx = (start_idx + i) % len(self.entries)
|
|
293
|
+
if idx in current_pool_indices:
|
|
294
|
+
self._traversal_queue.append(idx)
|
|
295
|
+
self._traversal_start = time.time()
|
|
296
|
+
self._traversal_cycle += 1
|
|
297
|
+
|
|
298
|
+
def _find_working_entry_once(self, check_timeout, cooldown_seconds):
|
|
299
|
+
# Consume from traversal queue for cached checks
|
|
300
|
+
for idx in self._traversal_queue:
|
|
346
301
|
entry = self.entries[idx]
|
|
347
302
|
health = self._health.get(entry, {})
|
|
348
303
|
last_checked = health.get("last_checked", 0)
|
|
@@ -352,23 +307,17 @@ class ProxyInterface:
|
|
|
352
307
|
if ok and (now - last_checked) < cooldown_seconds:
|
|
353
308
|
logger.debug(f"Using cached working proxy: {entry[0]}:{entry[1]}")
|
|
354
309
|
self.current_index = idx
|
|
310
|
+
self._traversal_queue.remove(idx)
|
|
355
311
|
return entry
|
|
356
312
|
elif not ok and (now - last_checked) < cooldown_seconds:
|
|
357
313
|
continue
|
|
358
314
|
else:
|
|
359
315
|
logger.debug(f"Checking proxy health: {entry[0]}:{entry[1]}")
|
|
360
|
-
if self.
|
|
316
|
+
if self._is_entry_alive(entry, timeout=check_timeout):
|
|
361
317
|
self.current_index = idx
|
|
318
|
+
self._traversal_queue.remove(idx)
|
|
362
319
|
return entry
|
|
363
320
|
|
|
364
|
-
logger.warning("No cached working proxies, forcing fresh checks")
|
|
365
|
-
for idx in candidates:
|
|
366
|
-
entry = self.entries[idx]
|
|
367
|
-
logger.debug(f"Force-checking proxy: {entry[0]}:{entry[1]}")
|
|
368
|
-
if self.is_entry_alive(entry, timeout=check_timeout):
|
|
369
|
-
self.current_index = idx
|
|
370
|
-
return entry
|
|
371
|
-
|
|
372
321
|
raise NoWorkingProxiesError("No working proxies available")
|
|
373
322
|
|
|
374
323
|
def _wait_for_new_ip(self, entry, baseline, timeout, interval, check_timeout):
|
|
@@ -386,9 +335,9 @@ class ProxyInterface:
|
|
|
386
335
|
current_ip = None
|
|
387
336
|
|
|
388
337
|
if current_ip and current_ip != baseline:
|
|
389
|
-
self.
|
|
338
|
+
self._mark_entry_status(entry, True, last_ip=current_ip)
|
|
390
339
|
logger.info(f"IP changed from {baseline} to {current_ip}")
|
|
391
|
-
return
|
|
340
|
+
return
|
|
392
341
|
|
|
393
342
|
time.sleep(interval)
|
|
394
343
|
|
|
@@ -9,7 +9,7 @@ datamarket/interfaces/drive.py,sha256=3nhx3THr2SHNWKYwme9F2nPpvsqyEMFIxz0whF2FjH
|
|
|
9
9
|
datamarket/interfaces/ftp.py,sha256=LH3Oz19k_xUNhzDXcrq5Ofb4c3uiph5pWUqpgiaDvHI,2671
|
|
10
10
|
datamarket/interfaces/nominatim.py,sha256=xizT94tVum7QPppfDgI5sEhx1mAXT-SM3JyPl8CDxxU,15148
|
|
11
11
|
datamarket/interfaces/peerdb.py,sha256=sO451wEGNb_0DDwchZ6eBVYKltqHM5XKau-WsfspXzA,23640
|
|
12
|
-
datamarket/interfaces/proxy.py,sha256=
|
|
12
|
+
datamarket/interfaces/proxy.py,sha256=Hxwmkii3wP3oplg4yMhr_NF6ru8tmz7z8jHld4SfPRw,14325
|
|
13
13
|
datamarket/interfaces/tinybird.py,sha256=cNG-kAPTdQn2inlNX9LPf-VVdtnLud947ApLVO40Now,2594
|
|
14
14
|
datamarket/params/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
15
15
|
datamarket/params/nominatim.py,sha256=S9TEB4FxmffvFyK9KffWl20TfXzWX69IAdbEehKar1I,11920
|
|
@@ -29,7 +29,7 @@ datamarket/utils/strings/obfuscation.py,sha256=Jo-x3f2Cb75983smmpcdPqUlBrLCTyrnm
|
|
|
29
29
|
datamarket/utils/strings/standardization.py,sha256=j_NbT-O1XnxDvDhct8panfkrfAC8R5OX6XM5fYBZ4RU,1496
|
|
30
30
|
datamarket/utils/typer.py,sha256=geWuwMwGQjBQhxo27hX0vEAeRl1j1TS0u2oFVfpAs5I,816
|
|
31
31
|
datamarket/utils/types.py,sha256=vxdQZdwdXrfPR4Es52gBgol-tMRIOD6oK9cBo3rB0JQ,74
|
|
32
|
-
datamarket-0.7.
|
|
33
|
-
datamarket-0.7.
|
|
34
|
-
datamarket-0.7.
|
|
35
|
-
datamarket-0.7.
|
|
32
|
+
datamarket-0.7.107.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
|
33
|
+
datamarket-0.7.107.dist-info/METADATA,sha256=F0zxym6rN2EWf3jh_1ilKdrIXMqdd_D4py1MzuYzuLA,7382
|
|
34
|
+
datamarket-0.7.107.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
|
|
35
|
+
datamarket-0.7.107.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|