datamarket 0.6.0__py3-none-any.whl → 0.10.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of datamarket has been flagged by the registry scanner; consult the registry's advisory page for details.
- datamarket/__init__.py +0 -1
- datamarket/exceptions/__init__.py +1 -0
- datamarket/exceptions/main.py +118 -0
- datamarket/interfaces/alchemy.py +1934 -25
- datamarket/interfaces/aws.py +81 -14
- datamarket/interfaces/azure.py +127 -0
- datamarket/interfaces/drive.py +60 -10
- datamarket/interfaces/ftp.py +37 -14
- datamarket/interfaces/llm.py +1220 -0
- datamarket/interfaces/nominatim.py +314 -42
- datamarket/interfaces/peerdb.py +272 -104
- datamarket/interfaces/proxy.py +354 -50
- datamarket/interfaces/tinybird.py +7 -15
- datamarket/params/nominatim.py +439 -0
- datamarket/utils/__init__.py +1 -1
- datamarket/utils/airflow.py +10 -7
- datamarket/utils/alchemy.py +2 -1
- datamarket/utils/logs.py +88 -0
- datamarket/utils/main.py +138 -10
- datamarket/utils/nominatim.py +201 -0
- datamarket/utils/playwright/__init__.py +0 -0
- datamarket/utils/playwright/async_api.py +274 -0
- datamarket/utils/playwright/sync_api.py +281 -0
- datamarket/utils/requests.py +655 -0
- datamarket/utils/selenium.py +6 -12
- datamarket/utils/strings/__init__.py +1 -0
- datamarket/utils/strings/normalization.py +217 -0
- datamarket/utils/strings/obfuscation.py +153 -0
- datamarket/utils/strings/standardization.py +40 -0
- datamarket/utils/typer.py +2 -1
- datamarket/utils/types.py +1 -0
- datamarket-0.10.3.dist-info/METADATA +172 -0
- datamarket-0.10.3.dist-info/RECORD +38 -0
- {datamarket-0.6.0.dist-info → datamarket-0.10.3.dist-info}/WHEEL +1 -2
- datamarket-0.6.0.dist-info/METADATA +0 -49
- datamarket-0.6.0.dist-info/RECORD +0 -24
- datamarket-0.6.0.dist-info/top_level.txt +0 -1
- {datamarket-0.6.0.dist-info → datamarket-0.10.3.dist-info/licenses}/LICENSE +0 -0
datamarket/interfaces/proxy.py
CHANGED
|
# IMPORTS

import logging
import random
import time
from datetime import timedelta
from functools import partial

import requests
import tenacity
from stem import Signal
from stem.control import Controller

from datamarket.exceptions import EnsureNewIPTimeoutError, NoWorkingProxiesError

########################################################################################################################
# SETUP

logger = logging.getLogger(__name__)
# stem is chatty at INFO level; keep only warnings from the Tor controller.
logging.getLogger("stem").setLevel(logging.WARNING)

# Window after which upstream proxies are expected to have rotated their exit IP;
# also used as the default retry budget when searching for a working proxy.
PROXY_ROTATION_INTERVAL = timedelta(minutes=10)
PROXY_ROTATION_TIMEOUT_SECONDS = int(PROXY_ROTATION_INTERVAL.total_seconds())

########################################################################################################################
# CLASSES

class ProxyInterface:
    """
    Manage HTTP, HTTPS, and SOCKS5 proxies configured in the [proxy] section.
    """

    # Service that echoes the caller's public IP as JSON.
    CHECK_IP_URL = "https://wtfismyip.com/json"

    def __init__(self, config):
        """
        Initialize the ProxyInterface with configuration.

        Args:
            config: Configuration object with proxy settings in the [proxy] section.
                Expected to have 'hosts' and optionally 'tor_password' settings.
        """
        self._load_from_config(config)
        self.current_index = -2  # -2: None, -1: Tor, >=0: Index in entries
        self._health = {}  # {entry: {"ok": bool, "last_checked": ts, "last_error": str}}
        self._traversal_queue = []  # indices into self.entries still to try this cycle
        self._traversal_start = None
        self._last_ip_wait = {}  # {entry: (last_wait_ts, traversal_cycle)}
        self._traversal_cycle = 0
        self._automatic_rotation = True
        self._pool = []

    def _load_from_config(self, cfg):
        """
        Load proxy configuration from config object.

        Raises:
            RuntimeError: if the [proxy] hosts option is missing or empty.
        """
        self.tor_password = cfg.get("proxy", "tor_password", fallback=None)
        hosts_raw = cfg.get("proxy", "hosts", fallback="")

        if not hosts_raw:
            raise RuntimeError("[proxy] hosts list is empty")

        entries = []
        for host_entry in (h.strip() for h in hosts_raw.split(",") if h.strip()):
            host, port, user, password = self._parse_host_entry(host_entry)
            entries.append((host, port, user, password))

        self.entries = entries

    def _parse_host_entry(self, host_entry):
        """
        Parse a host entry string into components.

        Accepts "host:port" or "user:password@host:port"; returns a
        (host, port, user, password) tuple with None for missing auth parts.
        """
        if "@" in host_entry:
            # rsplit so a password containing '@' still parses; split(":", 1)
            # so a password containing ':' stays intact.
            auth_part, host_part = host_entry.rsplit("@", 1)
            host, port = host_part.split(":")
            user, password = auth_part.split(":", 1)
            return host, port, user, password
        return *host_entry.split(":"), None, None

    @property
    def proxies(self):
        """
        Get current proxies using Tor if configured, otherwise standard proxies.
        """
        return self.get_proxies(use_tor=bool(self.tor_password))

    def set_automatic_rotation(self, enable=True):
        """Configures automatic proxy rotation on each request."""
        self._automatic_rotation = enable

    def rotate_proxies(self, randomize=False, use_auth=False):
        """
        Manually rotate to the next proxy in the pool.
        """
        if not self.entries:
            logger.warning("No proxy entries available to rotate")
            return

        self._pool = self._build_pool(use_auth)

        self._refresh_traversal_queue(self._pool, randomize)

        if self._traversal_queue:
            next_index = self._traversal_queue[0]
            self.current_index = next_index
            self._traversal_queue.pop(0)
            entry = self.entries[next_index]
            logger.info(f"Rotated to proxy: {entry[0]}:{entry[1]} (index {next_index})")
        else:
            logger.warning("Traversal queue is empty, cannot rotate")

    @staticmethod
    def get_proxy_url(host, port, user=None, password=None, schema="http"):
        """
        Build a proxy URL from components.
        """
        auth = f"{user}:{password}@" if user and password else ""
        return f"{schema}://{auth}{host}:{port}"

    def _get_proxies_dict_from_entry(self, entry, schema="http"):
        """
        Build a proxy dictionary from an entry tuple.
        """
        host, port, user, pwd = entry
        if schema == "socks5":
            return {"socks5": self.get_proxy_url(host, port, user, pwd, "socks5")}

        # Same URL serves both plain and TLS traffic through an HTTP proxy.
        url = self.get_proxy_url(host, port, user, pwd, "http")
        return {"http": url, "https": url}

    def get_proxies(
        self,
        use_tor=False,
        randomize=False,
        raw=False,
        use_auth=False,
        use_socks=False,
        check_timeout=5,
        cooldown_seconds=30,
        proxy_rotation_interval=PROXY_ROTATION_INTERVAL,
    ):
        """
        Get a working proxy with rotation and health checking.

        Returns either a raw (host, port, user, password) tuple (raw=True) or a
        requests-style proxies dict.
        """
        # Tor handling
        if use_tor:
            self.current_index = -1
            if raw:
                return ("127.0.0.1", "9050", None, None)
            return {"socks5": self.get_proxy_url("127.0.0.1", 9050, schema="socks5")}

        # Standard Proxy handling
        entry = self._get_working_entry(
            use_auth=use_auth,
            randomize=randomize,
            check_timeout=check_timeout,
            cooldown_seconds=cooldown_seconds,
            proxy_rotation_interval=proxy_rotation_interval,
        )

        if raw:
            return entry

        return self._get_proxies_dict_from_entry(entry, "socks5" if use_socks else "http")

    def check_current_ip(self, proxies=None):
        """
        Check the current IP address when using the given proxy.

        Returns the IP as a string, or None if the lookup failed.
        """
        try:
            proxies_arg = proxies or self.proxies
            resp = requests.get(self.CHECK_IP_URL, proxies=proxies_arg, timeout=30)
            return resp.json().get("YourFuckingIPAddress")
        except Exception as ex:
            logger.error(f"Failed to check IP: {ex}")

    def renew_tor_ip(self):
        """
        Request Tor to generate a new exit node IP address.
        """
        if not self.tor_password:
            logger.error("Tor password not configured")
            return

        try:
            logger.debug(f"Current IP: {self.check_current_ip()}")
            with Controller.from_port(port=9051) as controller:
                controller.authenticate(password=self.tor_password)
                controller.signal(Signal.NEWNYM)
                # Give Tor a moment to build the new circuit.
                time.sleep(5)
            logger.debug(f"New IP: {self.check_current_ip()}")
        except Exception as ex:
            logger.error(f"Failed to renew Tor IP: {ex}")

    def wait_for_new_ip(self, timeout=PROXY_ROTATION_TIMEOUT_SECONDS, interval=30, check_timeout=5):
        """
        Wait for the current proxy to provide a different IP address (proxy IP rotation).
        """
        if self.current_index == -2:
            logger.debug("No proxy currently selected, selecting one for IP waiting")
            self.get_proxies(raw=True)

        if self.current_index == -1:
            entry = ("127.0.0.1", "9050", None, None)
        elif 0 <= self.current_index < len(self.entries):
            entry = self.entries[self.current_index]
        else:
            raise RuntimeError("Could not select a proxy for IP waiting")

        now = time.time()
        # NOTE: the skip window is the module-level rotation interval, not the
        # `timeout` argument.
        interval_seconds = PROXY_ROTATION_INTERVAL.total_seconds()
        last_ts, last_cycle = self._last_ip_wait.get(entry, (None, 0))

        if last_ts and (now - last_ts) <= interval_seconds and self._traversal_cycle <= last_cycle:
            logger.debug("Skipping wait_for_new_ip: recently checked.")
            return

        self._last_ip_wait[entry] = (now, self._traversal_cycle)

        health = self._health.get(entry, {})
        baseline = health.get("last_ip")
        if not baseline:
            try:
                proxies = self._get_proxies_dict_from_entry(entry)
                baseline = self.check_current_ip(proxies)
            except Exception:
                logger.debug("Could not fetch baseline IP for proxy entry")

        if not baseline:
            raise RuntimeError(f"Could not determine baseline IP for entry {entry[0]}:{entry[1]}")

        return self._wait_for_new_ip(entry, baseline, timeout, interval, check_timeout)

    def _mark_entry_status(self, entry, ok, error=None, last_ip=None):
        """
        Update the health status of a proxy entry.
        """
        self._health[entry] = {
            "ok": ok,
            "last_checked": time.time(),
            "last_error": error,
            "last_ip": last_ip,
        }

    def _is_entry_alive(self, entry, timeout=5):
        """
        Check if a proxy entry is functional.
        """
        try:
            proxies = self._get_proxies_dict_from_entry(entry)
            resp = requests.get(self.CHECK_IP_URL, proxies=proxies, timeout=timeout)
            ok = resp.status_code == 200
            last_ip = resp.json().get("YourFuckingIPAddress") if ok else None
            self._mark_entry_status(entry, ok, last_ip=last_ip)
            return ok
        except Exception as ex:
            self._mark_entry_status(entry, False, str(ex))
            return False

    def _get_working_entry(
        self,
        use_auth=False,
        randomize=False,
        check_timeout=5,
        cooldown_seconds=30,
        proxy_rotation_interval=PROXY_ROTATION_INTERVAL,
    ):
        """
        Find and return a working proxy entry.

        Raises:
            NoWorkingProxiesError: when no proxy in the pool responds, after the
                retry budget (proxy_rotation_interval) is exhausted.
        """
        if not self.entries:
            raise NoWorkingProxiesError("No proxies available")

        pool = self._build_pool(use_auth)
        self._pool = pool

        # Initialize queue: sticky (current) or full refresh
        if not self._automatic_rotation and self.current_index >= 0:
            self._traversal_queue = [self.current_index]
        elif self._automatic_rotation or not self._traversal_queue:
            logger.debug(f"Refreshing rotation queue (randomize={randomize})")
            self._refresh_traversal_queue(pool, randomize)

        find_once = partial(self._find_working_entry_once, check_timeout, cooldown_seconds)

        if not proxy_rotation_interval:
            return find_once()

        def before_sleep(retry_state):
            tenacity.before_sleep_log(logger, logging.INFO)(retry_state)

            # Rebuild the queue before the next attempt so exhausted entries
            # get another chance in the new cycle.
            if self._automatic_rotation:
                self._refresh_traversal_queue(pool, randomize)
            elif self.current_index >= 0:
                self._traversal_queue = [self.current_index]

        retrying = tenacity.Retrying(
            wait=tenacity.wait_fixed(cooldown_seconds),
            stop=tenacity.stop_after_delay(proxy_rotation_interval),
            before_sleep=before_sleep,
            retry=tenacity.retry_if_exception_type(NoWorkingProxiesError),
            reraise=True,
        )
        return retrying(find_once)

    def _build_pool(self, use_auth):
        """
        Build a pool of available proxies based on authentication requirements.
        """
        pool = self.entries if use_auth else [e for e in self.entries if not e[2] and not e[3]]
        # Fall back to everything rather than an empty pool.
        return pool or self.entries

    def _refresh_traversal_queue(self, pool, randomize):
        """
        Rebuild the proxy traversal queue for the current rotation cycle.
        """
        current_pool_indices = [idx for idx, entry in enumerate(self.entries) if entry in pool]

        if not current_pool_indices:
            return

        if randomize:
            self._traversal_queue = current_pool_indices.copy()
            random.shuffle(self._traversal_queue)
        else:
            # Round-robin: start from next after current_index
            self._traversal_queue = []
            start_idx = (self.current_index + 1) % len(self.entries) if self.current_index >= 0 else 0
            for i in range(len(self.entries)):
                idx = (start_idx + i) % len(self.entries)
                if idx in current_pool_indices:
                    self._traversal_queue.append(idx)

        self._traversal_start = time.time()
        self._traversal_cycle += 1

    def _find_working_entry_once(self, check_timeout, cooldown_seconds):
        """
        Attempt to find a working proxy from the current traversal queue once.

        Raises:
            NoWorkingProxiesError: when the queue is exhausted without a hit.
        """
        for idx in list(self._traversal_queue):
            entry = self.entries[idx]
            health = self._health.get(entry, {})
            last_checked = health.get("last_checked", 0)
            ok = health.get("ok", False)
            now = time.time()

            is_fresh = (now - last_checked) < cooldown_seconds

            if ok and is_fresh:
                logger.debug(f"Using cached working proxy: {entry[0]}:{entry[1]}")
                self.current_index = idx
                self._traversal_queue.remove(idx)
                return entry

            if not ok and is_fresh:
                # This proxy failed recently, skip it for this traversal.
                continue

            # Stale or never checked, so we check it.
            logger.debug(f"Checking proxy health: {entry[0]}:{entry[1]}")
            if self._is_entry_alive(entry, timeout=check_timeout):
                self.current_index = idx
                self._traversal_queue.remove(idx)
                return entry
            else:
                # It's dead. Remove it from the queue for this traversal to avoid re-checking.
                self._traversal_queue.remove(idx)

        raise NoWorkingProxiesError("No working proxies available in current queue")

    def _wait_for_new_ip(self, entry, baseline, timeout, interval, check_timeout):
        """
        Poll the proxy repeatedly until its IP address changes from the baseline.

        Raises:
            EnsureNewIPTimeoutError: if the IP does not change within `timeout` seconds.
        """
        logger.info(f"Refreshing proxy IP (current baseline: {baseline})...")
        start = time.time()
        proxies = self._get_proxies_dict_from_entry(entry)

        while time.time() - start < timeout:
            try:
                resp = requests.get(self.CHECK_IP_URL, proxies=proxies, timeout=check_timeout)
                current_ip = resp.json().get("YourFuckingIPAddress")
            except Exception:
                current_ip = None

            if current_ip and current_ip != baseline:
                self._mark_entry_status(entry, True, last_ip=current_ip)
                logger.info(f"IP changed from {baseline} to {current_ip}")
                return

            time.sleep(interval)

        raise EnsureNewIPTimeoutError(f"Timed out waiting for new IP after {timeout}s")
# NOTE(review): this chunk of tinybird.py begins mid-file; earlier imports
# (e.g. json and the module logger setup used below) sit outside the visible
# diff region.
import requests
from requests.exceptions import ConnectionError
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_fixed

########################################################################################################################
# CLASSES
class TinybirdInterface:
    # NOTE(review): partial reconstruction — only the methods visible in this
    # diff chunk are included. __init__, __dict_lists_to_string, __converter,
    # and the post_url / request_params attributes are defined in the unseen
    # part of the class.

    def __prepare_json_row(self, obj_dict):
        """Serialize one record dict as a JSON string via the class converters."""
        return json.dumps(self.__dict_lists_to_string(obj_dict), default=self.__converter)

    @staticmethod
    def __handle_api_response(json_response):
        """Log the outcome reported by the Tinybird events API and return its counters."""
        # NOTE(review): this first assignment falls on a diff-hunk boundary and is
        # not visible in the chunk; reconstructed from the return statement below.
        successful_rows = json_response["successful_rows"]
        quarantined_rows = json_response["quarantined_rows"]

        if quarantined_rows > 0:
            logger.error(f"wrong insertion of {quarantined_rows} records to Tinybird API...")
        else:
            logger.info(f"successfully inserted {successful_rows} records to Tinybird API!")

        return successful_rows, quarantined_rows

    @retry(stop=stop_after_attempt(3), wait=wait_fixed(2), retry=retry_if_exception_type(ConnectionError))
    def __insert_data_to_endpoint(self, data):
        """POST an NDJSON payload to the configured endpoint, retrying on connection errors."""
        r = requests.post(self.post_url, params=self.request_params, data=data, timeout=30)
        return self.__handle_api_response(r.json())

    def insert_record_to_api(self, obj_dict):
        """Insert a single record dict."""
        return self.__insert_data_to_endpoint(self.__prepare_json_row(obj_dict))

    def insert_batch_to_api(self, batch):
        """Insert an iterable of record dicts as newline-delimited JSON."""
        return self.__insert_data_to_endpoint("\n".join([self.__prepare_json_row(x) for x in batch]))

    def insert_pandas_df_to_api(self, df):
        """Insert a pandas DataFrame as newline-delimited JSON records."""
        return self.__insert_data_to_endpoint(df.to_json(orient="records", lines=True))