datamarket 0.7.89__py3-none-any.whl → 0.7.125__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,13 +1,31 @@
1
+ ########################################################################################################################
2
+ # IMPORTS
3
+
1
4
  import logging
2
- import time
3
5
  import random
6
+ import time
7
+ from datetime import timedelta
8
+ from functools import partial
9
+
4
10
  import requests
11
+ import tenacity
5
12
  from stem import Signal
6
13
  from stem.control import Controller
7
14
 
15
+ from datamarket.exceptions import EnsureNewIPTimeoutError, NoWorkingProxiesError
16
+
17
+ ########################################################################################################################
18
+ # SETUP
19
+
8
20
  logger = logging.getLogger(__name__)
9
21
  logging.getLogger("stem").setLevel(logging.WARNING)
10
22
 
23
+ PROXY_ROTATION_INTERVAL = timedelta(minutes=10)
24
+ PROXY_ROTATION_TIMEOUT_SECONDS = int(PROXY_ROTATION_INTERVAL.total_seconds())
25
+
26
+ ########################################################################################################################
27
+ # CLASSES
28
+
11
29
 
12
30
  class ProxyInterface:
13
31
  """
@@ -17,15 +35,30 @@ class ProxyInterface:
17
35
  CHECK_IP_URL = "https://wtfismyip.com/json"
18
36
 
19
37
  def __init__(self, config):
38
+ """
39
+ Initialize the ProxyInterface with configuration.
40
+
41
+ Args:
42
+ config: Configuration object with proxy settings in the [proxy] section.
43
+ Expected to have 'hosts' and optionally 'tor_password' settings.
44
+ """
20
45
  self._load_from_config(config)
21
- self.current_index = random.randrange(len(self.entries)) if self.entries else 0
46
+ self.current_index = -2 # -2: None, -1: Tor, >=0: Index in entries
47
+ self._health = {} # {entry: {"ok": bool, "last_checked": ts, "last_error": str}}
48
+ self._traversal_queue = []
49
+ self._traversal_start = None
50
+ self._last_ip_wait = {}
51
+ self._traversal_cycle = 0
52
+ self._automatic_rotation = True
53
+ self._pool = []
22
54
 
23
55
  def _load_from_config(self, cfg):
24
- # Tor password (optional)
56
+ """
57
+ Load proxy configuration from config object.
58
+ """
25
59
  self.tor_password = cfg.get("proxy", "tor_password", fallback=None)
26
-
27
- # Comma-separated list of hosts
28
60
  hosts_raw = cfg.get("proxy", "hosts", fallback="")
61
+
29
62
  if not hosts_raw:
30
63
  raise RuntimeError("[proxy] hosts list is empty")
31
64
 
@@ -37,115 +70,328 @@ class ProxyInterface:
37
70
  self.entries = entries
38
71
 
39
72
  def _parse_host_entry(self, host_entry):
73
+ """
74
+ Parse a host entry string into components.
75
+ """
40
76
  if "@" in host_entry:
41
77
  auth_part, host_part = host_entry.rsplit("@", 1)
42
78
  host, port = host_part.split(":")
43
79
  user, password = auth_part.split(":", 1)
44
80
  return host, port, user, password
45
- else:
46
- host, port = host_entry.split(":")
47
- return host, port, None, None
81
+ return *host_entry.split(":"), None, None
48
82
 
49
83
  @property
50
84
  def proxies(self):
85
+ """
86
+ Get current proxies using Tor if configured, otherwise standard proxies.
87
+ """
51
88
  return self.get_proxies(use_tor=bool(self.tor_password))
52
89
 
90
def set_automatic_rotation(self, enable=True):
    """
    Turn automatic proxy rotation on or off.

    While the flag is on, ``_get_working_entry`` rebuilds the traversal queue
    on every lookup; while it is off and a proxy is already selected, the
    queue is pinned to that proxy.

    Args:
        enable: New value of the rotation flag (stored as given).
    """
    self._automatic_rotation = enable
93
+
94
def rotate_proxies(self, randomize=False, use_auth=False):
    """
    Manually advance to the next proxy of a freshly built traversal queue.

    The pool is rebuilt for ``use_auth``, the queue is refreshed, and its
    first index becomes ``current_index``. The proxy is NOT health-checked.

    Args:
        randomize: Passed to ``_refresh_traversal_queue``; shuffles the queue
            instead of continuing round-robin.
        use_auth: Passed to ``_build_pool``; include credentialed proxies.

    Returns:
        None. A warning is logged when there is nothing to rotate to.
    """
    if not self.entries:
        logger.warning("No proxy entries available to rotate")
        return

    pool = self._build_pool(use_auth)
    self._pool = pool
    self._refresh_traversal_queue(pool, randomize)

    if not self._traversal_queue:
        logger.warning("Traversal queue is empty, cannot rotate")
        return

    # Select the head of the queue, then consume it.
    next_index = self._traversal_queue[0]
    self.current_index = next_index
    del self._traversal_queue[0]

    entry = self.entries[next_index]
    logger.info(f"Rotated to proxy: {entry[0]}:{entry[1]} (index {next_index})")
114
+
53
115
  @staticmethod
54
116
  def get_proxy_url(host, port, user=None, password=None, schema="http"):
117
+ """
118
+ Build a proxy URL from components.
119
+ """
55
120
  auth = f"{user}:{password}@" if user and password else ""
56
121
  return f"{schema}://{auth}{host}:{port}"
57
122
 
58
- def get_proxies(self, use_tor=False, randomize=False, raw=False, use_auth=False, use_socks=False):
123
def _get_proxies_dict_from_entry(self, entry, schema="http"):
    """
    Turn an entry tuple into a proxies mapping.

    Args:
        entry: ``(host, port, user, password)`` tuple.
        schema: ``"socks5"`` selects a SOCKS5 URL; any other value yields
            an HTTP proxy URL.

    Returns:
        ``{"socks5": url}`` for ``schema == "socks5"``, otherwise
        ``{"http": url, "https": url}`` with the same HTTP proxy URL for both.
    """
    # NOTE(review): requests selects proxies by the target URL's scheme, so a
    # mapping keyed only by "socks5" is presumably meant for other consumers.
    host, port, user, pwd = entry
    target_schema = "socks5" if schema == "socks5" else "http"
    url = self.get_proxy_url(host, port, user, pwd, target_schema)

    if target_schema == "socks5":
        return {"socks5": url}
    return dict.fromkeys(("http", "https"), url)
61
133
 
62
- :param use_tor: route via local Tor SOCKS5 if True
63
- :param randomize: select a random proxy if True, otherwise round-robin
64
- :param raw: return raw (host, port, user, password) tuple if True
65
- :param use_auth: include proxies that require authentication if True; otherwise only credential-free
134
def get_proxies(
    self,
    use_tor=False,
    randomize=False,
    raw=False,
    use_auth=False,
    use_socks=False,
    check_timeout=5,
    cooldown_seconds=30,
    proxy_rotation_interval=PROXY_ROTATION_INTERVAL,
):
    """
    Return the proxy to use next: Tor, or a health-checked configured entry.

    With ``use_tor`` the local Tor endpoint ``127.0.0.1:9050`` is returned
    without any health check and ``current_index`` is set to ``-1``.
    Otherwise the entry comes from ``_get_working_entry``.

    Args:
        use_tor: Route through the local Tor SOCKS5 endpoint.
        randomize: Shuffle the traversal order instead of round-robin.
        raw: Return the ``(host, port, user, password)`` tuple instead of a
            proxies mapping.
        use_auth: Include proxies that carry credentials in the pool.
        use_socks: Build a ``socks5`` mapping instead of ``http``/``https``.
        check_timeout: Timeout in seconds for each proxy health check.
        cooldown_seconds: How long a health result stays fresh; also the wait
            between retry rounds.
        proxy_rotation_interval: Maximum time to keep retrying; a falsy value
            means a single pass over the queue.

    Returns:
        The raw entry tuple when ``raw`` is true. Otherwise ``{"socks5": url}``
        for Tor or ``use_socks``, else ``{"http": url, "https": url}``.

    Raises:
        NoWorkingProxiesError: Propagated from ``_get_working_entry`` when no
            working proxy is found.
    """
    if not use_tor:
        # Standard proxy handling
        entry = self._get_working_entry(
            use_auth=use_auth,
            randomize=randomize,
            check_timeout=check_timeout,
            cooldown_seconds=cooldown_seconds,
            proxy_rotation_interval=proxy_rotation_interval,
        )
        if raw:
            return entry
        schema = "socks5" if use_socks else "http"
        return self._get_proxies_dict_from_entry(entry, schema)

    # Tor handling: -1 marks Tor as the current selection.
    self.current_index = -1
    if raw:
        return ("127.0.0.1", "9050", None, None)
    tor_url = self.get_proxy_url("127.0.0.1", 9050, schema="socks5")
    return {"socks5": tor_url}
109
168
 
110
- def get_random(self, use_auth=False):
111
- # Random selection, optionally filtering out authenticated proxies
112
- if not self.entries:
113
- raise RuntimeError("No proxies available")
114
-
115
- pool = self.entries if use_auth else [e for e in self.entries if not e[2] and not e[3]]
116
- if not pool:
117
- pool = self.entries
118
-
119
- entry = random.choice(pool)
120
- # Update index to after selected entry for round-robin continuity
121
- try:
122
- pos = self.entries.index(entry)
123
- self.current_index = (pos + 1) % len(self.entries)
124
- except ValueError:
125
- pass
126
-
127
- return entry
128
-
129
- def check_current_ip(self):
169
def check_current_ip(self, proxies=None):
    """
    Return the public IP address seen through the given proxy.

    Args:
        proxies: Proxies mapping to use; when falsy, ``self.proxies`` is used.

    Returns:
        The ``"YourFuckingIPAddress"`` field of the JSON answer from
        ``CHECK_IP_URL``, or ``None`` when the lookup fails (the failure is
        logged, never raised).
    """
    try:
        proxies_arg = proxies or self.proxies

        # requests picks a proxy by the scheme of the URL being fetched
        # ("http", "https" or "all"). A mapping that only holds a "socks5" key
        # (Tor / use_socks) would be ignored and the request would go out
        # directly, reporting this machine's own IP. Re-key it for the lookup.
        socks_url = proxies_arg.get("socks5")
        if socks_url and not any(key in proxies_arg for key in ("http", "https", "all")):
            proxies_arg = {"http": socks_url, "https": socks_url}

        resp = requests.get(self.CHECK_IP_URL, proxies=proxies_arg, timeout=30)
        return resp.json().get("YourFuckingIPAddress")
    except Exception as ex:
        # Best-effort probe: callers treat a missing IP as "unknown".
        logger.error(f"Failed to check IP: {ex}")
        return None
135
179
 
136
180
  def renew_tor_ip(self):
181
+ """
182
+ Request Tor to generate a new exit node IP address.
183
+ """
137
184
  if not self.tor_password:
138
185
  logger.error("Tor password not configured")
139
186
  return
140
187
 
141
188
  try:
142
- logger.info(f"Current IP: {self.check_current_ip()}")
189
+ logger.debug(f"Current IP: {self.check_current_ip()}")
143
190
  with Controller.from_port(port=9051) as controller:
144
191
  controller.authenticate(password=self.tor_password)
145
192
  controller.signal(Signal.NEWNYM)
146
-
147
193
  time.sleep(5)
148
- logger.info(f"New IP: {self.check_current_ip()}")
194
+ logger.debug(f"New IP: {self.check_current_ip()}")
195
+ except Exception as ex:
196
+ logger.error(f"Failed to renew Tor IP: {ex}")
197
+
198
def wait_for_new_ip(self, timeout=PROXY_ROTATION_TIMEOUT_SECONDS, interval=30, check_timeout=5):
    """
    Wait until the currently selected proxy reports a different public IP.

    If no proxy has been selected yet, one is selected via ``get_proxies``.
    The wait is skipped when the same entry was already waited on within
    ``PROXY_ROTATION_INTERVAL`` and no new traversal cycle has started since.

    Args:
        timeout: Maximum number of seconds to poll for a new IP.
        interval: Seconds to sleep between polls.
        check_timeout: Timeout in seconds of each individual IP request.

    Returns:
        None, both when the wait is skipped and when the IP has changed.

    Raises:
        RuntimeError: If no proxy could be selected, or the baseline IP of
            the selected proxy could not be determined.
        EnsureNewIPTimeoutError: Propagated from ``_wait_for_new_ip`` when the
            IP did not change within ``timeout``.
    """
    if self.current_index == -2:
        logger.debug("No proxy currently selected, selecting one for IP waiting")
        self.get_proxies(raw=True)

    if self.current_index == -1:
        # NOTE(review): the Tor entry is probed below with the default "http"
        # schema although 9050 is used as a SOCKS5 port elsewhere in this
        # class — confirm waiting is meant to work for Tor.
        entry = ("127.0.0.1", "9050", None, None)
    elif 0 <= self.current_index < len(self.entries):
        entry = self.entries[self.current_index]
    else:
        raise RuntimeError("Could not select a proxy for IP waiting")

    now = time.time()
    interval_seconds = PROXY_ROTATION_INTERVAL.total_seconds()
    last_ts, last_cycle = self._last_ip_wait.get(entry, (None, 0))

    if last_ts and (now - last_ts) <= interval_seconds and self._traversal_cycle <= last_cycle:
        logger.debug("Skipping wait_for_new_ip: recently checked.")
        return

    # Prefer the IP remembered by the last health check; otherwise ask now.
    baseline = self._health.get(entry, {}).get("last_ip")
    if not baseline:
        try:
            proxies = self._get_proxies_dict_from_entry(entry)
            baseline = self.check_current_ip(proxies)
        except Exception:
            logger.debug("Could not fetch baseline IP for proxy entry")

    if not baseline:
        raise RuntimeError(f"Could not determine baseline IP for entry {entry[0]}:{entry[1]}")

    # Record the wait only once it is really going to happen. Recording it
    # before the baseline lookup made a failed lookup look like a completed
    # wait, so the caller's retry was silently skipped.
    self._last_ip_wait[entry] = (now, self._traversal_cycle)

    return self._wait_for_new_ip(entry, baseline, timeout, interval, check_timeout)
236
+
237
def _mark_entry_status(self, entry, ok, error=None, last_ip=None):
    """
    Replace the stored health record of a proxy entry.

    Args:
        entry: Entry tuple used as the key in ``self._health``.
        ok: Whether the proxy is considered working.
        error: Optional description of the last failure.
        last_ip: Optional public IP last seen through this proxy.

    The record is overwritten as a whole, so omitted ``error`` / ``last_ip``
    reset those fields to ``None``. ``last_checked`` is set to the current time.
    """
    record = dict(
        ok=ok,
        last_checked=time.time(),
        last_error=error,
        last_ip=last_ip,
    )
    self._health[entry] = record
247
+
248
def _is_entry_alive(self, entry, timeout=5):
    """
    Probe a proxy entry and record the outcome in the health table.

    The proxy counts as alive when a GET of ``CHECK_IP_URL`` through it
    answers with status 200; the IP from the JSON answer is stored with the
    health record. Any exception marks the entry as failed.

    Args:
        entry: ``(host, port, user, password)`` tuple to probe.
        timeout: Request timeout in seconds.

    Returns:
        True if the proxy answered with status 200, False otherwise.
    """
    try:
        proxies = self._get_proxies_dict_from_entry(entry)
        resp = requests.get(self.CHECK_IP_URL, proxies=proxies, timeout=timeout)
        if resp.status_code == 200:
            ok, last_ip = True, resp.json().get("YourFuckingIPAddress")
        else:
            ok, last_ip = False, None
        self._mark_entry_status(entry, ok, last_ip=last_ip)
    except Exception as ex:
        self._mark_entry_status(entry, False, str(ex))
        return False
    return ok
262
+
263
def _get_working_entry(
    self,
    use_auth=False,
    randomize=False,
    check_timeout=5,
    cooldown_seconds=30,
    proxy_rotation_interval=PROXY_ROTATION_INTERVAL,
):
    """
    Select a working proxy entry, retrying until the rotation interval ends.

    Args:
        use_auth: Include credentialed proxies in the pool.
        randomize: Shuffle the traversal queue instead of round-robin.
        check_timeout: Timeout in seconds of each health check.
        cooldown_seconds: Freshness window of cached health results and the
            fixed wait between retry rounds.
        proxy_rotation_interval: Total time to keep retrying; when falsy,
            the queue is tried exactly once.

    Returns:
        The selected ``(host, port, user, password)`` entry.

    Raises:
        NoWorkingProxiesError: If there are no entries, or no working proxy
            was found before retrying stopped.
    """
    if not self.entries:
        raise NoWorkingProxiesError("No proxies available")

    pool = self._pool = self._build_pool(use_auth)

    # Sticky mode keeps using the current proxy; otherwise the queue is
    # rebuilt on every call (automatic rotation) or when it has run dry.
    rotating = self._automatic_rotation
    if not rotating and self.current_index >= 0:
        self._traversal_queue = [self.current_index]
    elif rotating or not self._traversal_queue:
        logger.debug(f"Refreshing rotation queue (randomize={randomize})")
        self._refresh_traversal_queue(pool, randomize)

    find_once = partial(self._find_working_entry_once, check_timeout, cooldown_seconds)

    if not proxy_rotation_interval:
        return find_once()

    log_before_sleep = tenacity.before_sleep_log(logger, logging.INFO)

    def before_sleep(retry_state):
        # Log the upcoming retry, then give the next attempt a queue to walk.
        log_before_sleep(retry_state)

        if self._automatic_rotation:
            self._refresh_traversal_queue(pool, randomize)
        elif self.current_index >= 0:
            self._traversal_queue = [self.current_index]

    return tenacity.Retrying(
        retry=tenacity.retry_if_exception_type(NoWorkingProxiesError),
        wait=tenacity.wait_fixed(cooldown_seconds),
        stop=tenacity.stop_after_delay(proxy_rotation_interval),
        before_sleep=before_sleep,
        reraise=True,
    )(find_once)
308
+
309
def _build_pool(self, use_auth):
    """
    Choose which entries are eligible for selection.

    Args:
        use_auth: When true, every entry is eligible. When false, only entries
            with neither user nor password are eligible.

    Returns:
        The eligible entries; falls back to all entries when filtering leaves
        nothing.
    """
    if use_auth:
        return self.entries

    credential_free = [entry for entry in self.entries if not (entry[2] or entry[3])]
    return credential_free if credential_free else self.entries
315
+
316
def _refresh_traversal_queue(self, pool, randomize):
    """
    Rebuild the queue of entry indices to try in the next traversal.

    Args:
        pool: Eligible entries (hashable tuples, as produced by ``_build_pool``).
        randomize: Shuffle the eligible indices; otherwise order them
            round-robin, starting right after ``current_index`` (or at 0 when
            no configured proxy is selected).

    Side effects:
        Replaces ``_traversal_queue``, stamps ``_traversal_start`` and
        increments ``_traversal_cycle``. Nothing changes when no entry of
        ``self.entries`` is in ``pool``.
    """
    # A set makes each membership test O(1) instead of a scan of the pool.
    pool_members = set(pool)
    eligible = [idx for idx, entry in enumerate(self.entries) if entry in pool_members]

    if not eligible:
        return

    if randomize:
        random.shuffle(eligible)
        self._traversal_queue = eligible
    else:
        # Round-robin: `eligible` is ascending, so rotating it at start_idx
        # gives the same order as walking all indices modulo len(entries).
        start_idx = (self.current_index + 1) % len(self.entries) if self.current_index >= 0 else 0
        self._traversal_queue = [idx for idx in eligible if idx >= start_idx] + [
            idx for idx in eligible if idx < start_idx
        ]

    self._traversal_start = time.time()
    self._traversal_cycle += 1
339
+
340
def _find_working_entry_once(self, check_timeout, cooldown_seconds):
    """
    Walk the current traversal queue once and return the first usable entry.

    For each queued index: a health result younger than ``cooldown_seconds``
    is trusted (used if OK, skipped if failed); anything older or unknown is
    re-checked with ``_is_entry_alive``. The chosen index becomes
    ``current_index`` and leaves the queue; entries found dead leave the
    queue too.

    Args:
        check_timeout: Timeout in seconds passed to the health check.
        cooldown_seconds: Maximum age in seconds of a trusted health result.

    Returns:
        The selected ``(host, port, user, password)`` entry.

    Raises:
        NoWorkingProxiesError: If no queued entry is usable.
    """
    # Iterate over a snapshot because the queue is mutated along the way.
    for idx in tuple(self._traversal_queue):
        entry = self.entries[idx]
        health = self._health.get(entry, {})
        checked_at = health.get("last_checked", 0)
        healthy = health.get("ok", False)
        is_fresh = (time.time() - checked_at) < cooldown_seconds

        if is_fresh:
            if not healthy:
                # Failed recently: skip it, but leave it queued.
                continue
            logger.debug(f"Using cached working proxy: {entry[0]}:{entry[1]}")
        else:
            # Stale or never checked: probe it now.
            logger.debug(f"Checking proxy health: {entry[0]}:{entry[1]}")
            if not self._is_entry_alive(entry, timeout=check_timeout):
                # Dead: drop it so this traversal does not probe it again.
                self._traversal_queue.remove(idx)
                continue

        self.current_index = idx
        self._traversal_queue.remove(idx)
        return entry

    raise NoWorkingProxiesError("No working proxies available in current queue")
374
+
375
def _wait_for_new_ip(self, entry, baseline, timeout, interval, check_timeout):
    """
    Poll the proxy until the IP it reports differs from ``baseline``.

    Args:
        entry: ``(host, port, user, password)`` tuple of the proxy to poll.
        baseline: IP address the proxy is currently known to use.
        timeout: Maximum number of seconds to keep polling.
        interval: Seconds to sleep between polls.
        check_timeout: Timeout in seconds of each individual IP request.

    Returns:
        None once a different IP is seen; the entry's health record is
        updated with the new IP.

    Raises:
        EnsureNewIPTimeoutError: If the IP did not change within ``timeout``.
    """
    logger.info(f"Refreshing proxy IP (current baseline: {baseline})...")
    proxies = self._get_proxies_dict_from_entry(entry)

    # Monotonic clock: wall-clock adjustments must not shorten or extend the
    # wait.
    deadline = time.monotonic() + timeout

    while time.monotonic() < deadline:
        try:
            resp = requests.get(self.CHECK_IP_URL, proxies=proxies, timeout=check_timeout)
            current_ip = resp.json().get("YourFuckingIPAddress")
        except Exception:
            # A failed poll just counts as "no new IP yet".
            current_ip = None

        if current_ip and current_ip != baseline:
            self._mark_entry_status(entry, True, last_ip=current_ip)
            logger.info(f"IP changed from {baseline} to {current_ip}")
            return

        # Never sleep past the deadline; a full `interval` here could overshoot
        # the timeout by up to `interval` seconds.
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        time.sleep(min(interval, remaining))

    raise EnsureNewIPTimeoutError(f"Timed out waiting for new IP after {timeout}s")
@@ -43,9 +43,7 @@ class TinybirdInterface:
43
43
  }
44
44
 
45
45
  def __prepare_json_row(self, obj_dict):
46
- return json.dumps(
47
- self.__dict_lists_to_string(obj_dict), default=self.__converter
48
- )
46
+ return json.dumps(self.__dict_lists_to_string(obj_dict), default=self.__converter)
49
47
 
50
48
  @staticmethod
51
49
  def __handle_api_response(json_response):
@@ -53,13 +51,9 @@ class TinybirdInterface:
53
51
  quarantined_rows = json_response["quarantined_rows"]
54
52
 
55
53
  if quarantined_rows > 0:
56
- logger.error(
57
- f"wrong insertion of {quarantined_rows} records to Tinybird API..."
58
- )
54
+ logger.error(f"wrong insertion of {quarantined_rows} records to Tinybird API...")
59
55
  else:
60
- logger.info(
61
- f"successfully inserted {successful_rows} records to Tinybird API!"
62
- )
56
+ logger.info(f"successfully inserted {successful_rows} records to Tinybird API!")
63
57
 
64
58
  return successful_rows, quarantined_rows
65
59
 
@@ -72,9 +66,7 @@ class TinybirdInterface:
72
66
  return self.__insert_data_to_endpoint(self.__prepare_json_row(obj_dict))
73
67
 
74
68
  def insert_batch_to_api(self, batch):
75
- return self.__insert_data_to_endpoint(
76
- "\n".join([self.__prepare_json_row(x) for x in batch])
77
- )
69
+ return self.__insert_data_to_endpoint("\n".join([self.__prepare_json_row(x) for x in batch]))
78
70
 
79
71
  def insert_pandas_df_to_api(self, df):
80
72
  return self.__insert_data_to_endpoint(df.to_json(orient="records", lines=True))