datamarket 0.6.0__py3-none-any.whl → 0.10.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only; it reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datamarket might be problematic. See the details below before upgrading.

Files changed (38):
  1. datamarket/__init__.py +0 -1
  2. datamarket/exceptions/__init__.py +1 -0
  3. datamarket/exceptions/main.py +118 -0
  4. datamarket/interfaces/alchemy.py +1934 -25
  5. datamarket/interfaces/aws.py +81 -14
  6. datamarket/interfaces/azure.py +127 -0
  7. datamarket/interfaces/drive.py +60 -10
  8. datamarket/interfaces/ftp.py +37 -14
  9. datamarket/interfaces/llm.py +1220 -0
  10. datamarket/interfaces/nominatim.py +314 -42
  11. datamarket/interfaces/peerdb.py +272 -104
  12. datamarket/interfaces/proxy.py +354 -50
  13. datamarket/interfaces/tinybird.py +7 -15
  14. datamarket/params/nominatim.py +439 -0
  15. datamarket/utils/__init__.py +1 -1
  16. datamarket/utils/airflow.py +10 -7
  17. datamarket/utils/alchemy.py +2 -1
  18. datamarket/utils/logs.py +88 -0
  19. datamarket/utils/main.py +138 -10
  20. datamarket/utils/nominatim.py +201 -0
  21. datamarket/utils/playwright/__init__.py +0 -0
  22. datamarket/utils/playwright/async_api.py +274 -0
  23. datamarket/utils/playwright/sync_api.py +281 -0
  24. datamarket/utils/requests.py +655 -0
  25. datamarket/utils/selenium.py +6 -12
  26. datamarket/utils/strings/__init__.py +1 -0
  27. datamarket/utils/strings/normalization.py +217 -0
  28. datamarket/utils/strings/obfuscation.py +153 -0
  29. datamarket/utils/strings/standardization.py +40 -0
  30. datamarket/utils/typer.py +2 -1
  31. datamarket/utils/types.py +1 -0
  32. datamarket-0.10.3.dist-info/METADATA +172 -0
  33. datamarket-0.10.3.dist-info/RECORD +38 -0
  34. {datamarket-0.6.0.dist-info → datamarket-0.10.3.dist-info}/WHEEL +1 -2
  35. datamarket-0.6.0.dist-info/METADATA +0 -49
  36. datamarket-0.6.0.dist-info/RECORD +0 -24
  37. datamarket-0.6.0.dist-info/top_level.txt +0 -1
  38. {datamarket-0.6.0.dist-info → datamarket-0.10.3.dist-info/licenses}/LICENSE +0 -0
@@ -2,92 +2,396 @@
2
2
  # IMPORTS
3
3
 
4
4
  import logging
5
- import time
6
5
  import random
6
+ import time
7
+ from datetime import timedelta
8
+ from functools import partial
7
9
 
8
10
  import requests
11
+ import tenacity
9
12
  from stem import Signal
10
13
  from stem.control import Controller
11
14
 
15
+ from datamarket.exceptions import EnsureNewIPTimeoutError, NoWorkingProxiesError
16
+
12
17
  ########################################################################################################################
13
- # CLASSES
18
+ # SETUP
14
19
 
15
20
  logger = logging.getLogger(__name__)
16
21
  logging.getLogger("stem").setLevel(logging.WARNING)
17
22
 
23
+ PROXY_ROTATION_INTERVAL = timedelta(minutes=10)
24
+ PROXY_ROTATION_TIMEOUT_SECONDS = int(PROXY_ROTATION_INTERVAL.total_seconds())
25
+
26
+ ########################################################################################################################
27
+ # CLASSES
28
+
18
29
 
19
30
  class ProxyInterface:
31
+ """
32
+ Manage HTTP, HTTPS, and SOCKS5 proxies configured in the [proxy] section.
33
+ """
34
+
20
35
  CHECK_IP_URL = "https://wtfismyip.com/json"
21
36
 
22
37
  def __init__(self, config):
23
- if "proxy" in config:
24
- self.config = config["proxy"]
25
- self.current_index = 0 # Initialize index for round robin
26
- else:
27
- logger.warning("no proxy section in config")
38
+ """
39
+ Initialize the ProxyInterface with configuration.
40
+
41
+ Args:
42
+ config: Configuration object with proxy settings in the [proxy] section.
43
+ Expected to have 'hosts' and optionally 'tor_password' settings.
44
+ """
45
+ self._load_from_config(config)
46
+ self.current_index = -2 # -2: None, -1: Tor, >=0: Index in entries
47
+ self._health = {} # {entry: {"ok": bool, "last_checked": ts, "last_error": str}}
48
+ self._traversal_queue = []
49
+ self._traversal_start = None
50
+ self._last_ip_wait = {}
51
+ self._traversal_cycle = 0
52
+ self._automatic_rotation = True
53
+ self._pool = []
54
+
55
+ def _load_from_config(self, cfg):
56
+ """
57
+ Load proxy configuration from config object.
58
+ """
59
+ self.tor_password = cfg.get("proxy", "tor_password", fallback=None)
60
+ hosts_raw = cfg.get("proxy", "hosts", fallback="")
61
+
62
+ if not hosts_raw:
63
+ raise RuntimeError("[proxy] hosts list is empty")
64
+
65
+ entries = []
66
+ for host_entry in (h.strip() for h in hosts_raw.split(",") if h.strip()):
67
+ host, port, user, password = self._parse_host_entry(host_entry)
68
+ entries.append((host, port, user, password))
69
+
70
+ self.entries = entries
71
+
72
+ def _parse_host_entry(self, host_entry):
73
+ """
74
+ Parse a host entry string into components.
75
+ """
76
+ if "@" in host_entry:
77
+ auth_part, host_part = host_entry.rsplit("@", 1)
78
+ host, port = host_part.split(":")
79
+ user, password = auth_part.split(":", 1)
80
+ return host, port, user, password
81
+ return *host_entry.split(":"), None, None
28
82
 
29
83
  @property
30
84
  def proxies(self):
31
- return self.get_proxies(use_tor="tor_password" in self.config)
85
+ """
86
+ Get current proxies using Tor if configured, otherwise standard proxies.
87
+ """
88
+ return self.get_proxies(use_tor=bool(self.tor_password))
32
89
 
33
- @staticmethod
34
- def get_proxy_url(host, port, user=None, password=None, use_socks=True):
35
- proxy_url = f"{host}:{port}"
90
+ def set_automatic_rotation(self, enable=True):
91
+ """Configures automatic proxy rotation on each request."""
92
+ self._automatic_rotation = enable
36
93
 
37
- if user and password:
38
- proxy_url = f"{user}:{password}@{proxy_url}"
94
+ def rotate_proxies(self, randomize=False, use_auth=False):
95
+ """
96
+ Manually rotate to the next proxy in the pool.
97
+ """
98
+ if not self.entries:
99
+ logger.warning("No proxy entries available to rotate")
100
+ return
39
101
 
40
- proxy_url = f"socks5://{proxy_url}" if use_socks else f"http://{proxy_url}"
41
- return proxy_url
102
+ self._pool = self._build_pool(use_auth)
42
103
 
43
- def get_proxies(self, use_tor=False, randomize=False):
44
- if use_tor:
45
- proxy_url = self.get_proxy_url("127.0.0.1", 9050)
104
+ self._refresh_traversal_queue(self._pool, randomize)
105
+
106
+ if self._traversal_queue:
107
+ next_index = self._traversal_queue[0]
108
+ self.current_index = next_index
109
+ self._traversal_queue.pop(0)
110
+ entry = self.entries[next_index]
111
+ logger.info(f"Rotated to proxy: {entry[0]}:{entry[1]} (index {next_index})")
46
112
  else:
47
- current_host, current_port = self.get_random_host_port() if randomize else self.get_current_host_port()
48
-
49
- user = self.config.get("user")
50
- password = self.config.get("password")
51
- use_socks = self.config.get("socks", "false").lower() == "true"
113
+ logger.warning("Traversal queue is empty, cannot rotate")
52
114
 
53
- proxy_url = self.get_proxy_url(current_host, current_port, user, password, use_socks)
115
+ @staticmethod
116
+ def get_proxy_url(host, port, user=None, password=None, schema="http"):
117
+ """
118
+ Build a proxy URL from components.
119
+ """
120
+ auth = f"{user}:{password}@" if user and password else ""
121
+ return f"{schema}://{auth}{host}:{port}"
54
122
 
55
- return {
56
- "http": proxy_url,
57
- "https": proxy_url,
58
- }
123
+ def _get_proxies_dict_from_entry(self, entry, schema="http"):
124
+ """
125
+ Build a proxy dictionary from an entry tuple.
126
+ """
127
+ host, port, user, pwd = entry
128
+ if schema == "socks5":
129
+ return {"socks5": self.get_proxy_url(host, port, user, pwd, "socks5")}
130
+
131
+ url = self.get_proxy_url(host, port, user, pwd, "http")
132
+ return {"http": url, "https": url}
133
+
134
+ def get_proxies(
135
+ self,
136
+ use_tor=False,
137
+ randomize=False,
138
+ raw=False,
139
+ use_auth=False,
140
+ use_socks=False,
141
+ check_timeout=5,
142
+ cooldown_seconds=30,
143
+ proxy_rotation_interval=PROXY_ROTATION_INTERVAL,
144
+ ):
145
+ """
146
+ Get a working proxy with rotation and health checking.
147
+ """
148
+ # Tor handling
149
+ if use_tor:
150
+ self.current_index = -1
151
+ if raw:
152
+ return ("127.0.0.1", "9050", None, None)
153
+ return {"socks5": self.get_proxy_url("127.0.0.1", 9050, schema="socks5")}
154
+
155
+ # Standard Proxy handling
156
+ entry = self._get_working_entry(
157
+ use_auth=use_auth,
158
+ randomize=randomize,
159
+ check_timeout=check_timeout,
160
+ cooldown_seconds=cooldown_seconds,
161
+ proxy_rotation_interval=proxy_rotation_interval,
162
+ )
59
163
 
60
- def get_current_host_port(self):
61
- hosts = self.config["hosts"].split(",") if isinstance(self.config["hosts"], str) else self.config["hosts"]
62
- host_port_pairs = [hp.split(":") for hp in hosts]
63
- current_host, current_port = host_port_pairs[self.current_index]
64
- self.current_index = (self.current_index + 1) % len(host_port_pairs)
65
- return current_host, current_port
164
+ if raw:
165
+ return entry
66
166
 
67
- def get_random_host_port(self):
68
- hosts = self.config["hosts"].split(",") if isinstance(self.config["hosts"], str) else self.config["hosts"]
69
- host_port_pairs = [hp.split(":") for hp in hosts]
70
- self.current_index = random.randint(0, len(host_port_pairs) - 1)
71
- return host_port_pairs[self.current_index]
167
+ return self._get_proxies_dict_from_entry(entry, "socks5" if use_socks else "http")
72
168
 
73
- def check_current_ip(self):
169
+ def check_current_ip(self, proxies=None):
170
+ """
171
+ Check the current IP address when using the given proxy.
172
+ """
74
173
  try:
75
- return requests.get(self.CHECK_IP_URL, proxies=self.proxies).json()[
76
- "YourFuckingIPAddress"
77
- ]
174
+ proxies_arg = proxies or self.proxies
175
+ resp = requests.get(self.CHECK_IP_URL, proxies=proxies_arg, timeout=30)
176
+ return resp.json().get("YourFuckingIPAddress")
78
177
  except Exception as ex:
79
- logger.error(ex)
178
+ logger.error(f"Failed to check IP: {ex}")
80
179
 
81
180
  def renew_tor_ip(self):
181
+ """
182
+ Request Tor to generate a new exit node IP address.
183
+ """
184
+ if not self.tor_password:
185
+ logger.error("Tor password not configured")
186
+ return
187
+
82
188
  try:
83
- logger.info(f"renewing Tor ip: {self.check_current_ip()}...")
189
+ logger.debug(f"Current IP: {self.check_current_ip()}")
84
190
  with Controller.from_port(port=9051) as controller:
85
- controller.authenticate(password=self.config["tor_password"])
191
+ controller.authenticate(password=self.tor_password)
86
192
  controller.signal(Signal.NEWNYM)
87
-
88
193
  time.sleep(5)
89
- logger.info(f"new Tor IP: {self.check_current_ip()}")
194
+ logger.debug(f"New IP: {self.check_current_ip()}")
195
+ except Exception as ex:
196
+ logger.error(f"Failed to renew Tor IP: {ex}")
197
+
198
+ def wait_for_new_ip(self, timeout=PROXY_ROTATION_TIMEOUT_SECONDS, interval=30, check_timeout=5):
199
+ """
200
+ Wait for the current proxy to provide a different IP address (proxy IP rotation).
201
+ """
202
+ if self.current_index == -2:
203
+ logger.debug("No proxy currently selected, selecting one for IP waiting")
204
+ self.get_proxies(raw=True)
205
+
206
+ if self.current_index == -1:
207
+ entry = ("127.0.0.1", "9050", None, None)
208
+ elif 0 <= self.current_index < len(self.entries):
209
+ entry = self.entries[self.current_index]
210
+ else:
211
+ raise RuntimeError("Could not select a proxy for IP waiting")
212
+
213
+ now = time.time()
214
+ interval_seconds = PROXY_ROTATION_INTERVAL.total_seconds()
215
+ last_ts, last_cycle = self._last_ip_wait.get(entry, (None, 0))
216
+
217
+ if last_ts and (now - last_ts) <= interval_seconds and self._traversal_cycle <= last_cycle:
218
+ logger.debug("Skipping wait_for_new_ip: recently checked.")
219
+ return
90
220
 
221
+ self._last_ip_wait[entry] = (now, self._traversal_cycle)
222
+
223
+ health = self._health.get(entry, {})
224
+ baseline = health.get("last_ip")
225
+ if not baseline:
226
+ try:
227
+ proxies = self._get_proxies_dict_from_entry(entry)
228
+ baseline = self.check_current_ip(proxies)
229
+ except Exception:
230
+ logger.debug("Could not fetch baseline IP for proxy entry")
231
+
232
+ if not baseline:
233
+ raise RuntimeError(f"Could not determine baseline IP for entry {entry[0]}:{entry[1]}")
234
+
235
+ return self._wait_for_new_ip(entry, baseline, timeout, interval, check_timeout)
236
+
237
+ def _mark_entry_status(self, entry, ok, error=None, last_ip=None):
238
+ """
239
+ Update the health status of a proxy entry.
240
+ """
241
+ self._health[entry] = {
242
+ "ok": ok,
243
+ "last_checked": time.time(),
244
+ "last_error": error,
245
+ "last_ip": last_ip,
246
+ }
247
+
248
+ def _is_entry_alive(self, entry, timeout=5):
249
+ """
250
+ Check if a proxy entry is functional.
251
+ """
252
+ try:
253
+ proxies = self._get_proxies_dict_from_entry(entry)
254
+ resp = requests.get(self.CHECK_IP_URL, proxies=proxies, timeout=timeout)
255
+ ok = resp.status_code == 200
256
+ last_ip = resp.json().get("YourFuckingIPAddress") if ok else None
257
+ self._mark_entry_status(entry, ok, last_ip=last_ip)
258
+ return ok
91
259
  except Exception as ex:
92
- logger.error("unable to renew Tor ip")
93
- logger.error(ex)
260
+ self._mark_entry_status(entry, False, str(ex))
261
+ return False
262
+
263
+ def _get_working_entry(
264
+ self,
265
+ use_auth=False,
266
+ randomize=False,
267
+ check_timeout=5,
268
+ cooldown_seconds=30,
269
+ proxy_rotation_interval=PROXY_ROTATION_INTERVAL,
270
+ ):
271
+ """
272
+ Find and return a working proxy entry.
273
+ """
274
+ if not self.entries:
275
+ raise NoWorkingProxiesError("No proxies available")
276
+
277
+ pool = self._build_pool(use_auth)
278
+ self._pool = pool
279
+
280
+ # Initialize queue: sticky (current) or full refresh
281
+ if not self._automatic_rotation and self.current_index >= 0:
282
+ self._traversal_queue = [self.current_index]
283
+ elif self._automatic_rotation or not self._traversal_queue:
284
+ logger.debug(f"Refreshing rotation queue (randomize={randomize})")
285
+ self._refresh_traversal_queue(pool, randomize)
286
+
287
+ find_once = partial(self._find_working_entry_once, check_timeout, cooldown_seconds)
288
+
289
+ if not proxy_rotation_interval:
290
+ return find_once()
291
+
292
+ def before_sleep(retry_state):
293
+ tenacity.before_sleep_log(logger, logging.INFO)(retry_state)
294
+
295
+ if self._automatic_rotation:
296
+ self._refresh_traversal_queue(pool, randomize)
297
+ elif self.current_index >= 0:
298
+ self._traversal_queue = [self.current_index]
299
+
300
+ retrying = tenacity.Retrying(
301
+ wait=tenacity.wait_fixed(cooldown_seconds),
302
+ stop=tenacity.stop_after_delay(proxy_rotation_interval),
303
+ before_sleep=before_sleep,
304
+ retry=tenacity.retry_if_exception_type(NoWorkingProxiesError),
305
+ reraise=True,
306
+ )
307
+ return retrying(find_once)
308
+
309
+ def _build_pool(self, use_auth):
310
+ """
311
+ Build a pool of available proxies based on authentication requirements.
312
+ """
313
+ pool = self.entries if use_auth else [e for e in self.entries if not e[2] and not e[3]]
314
+ return pool or self.entries
315
+
316
+ def _refresh_traversal_queue(self, pool, randomize):
317
+ """
318
+ Rebuild the proxy traversal queue for the current rotation cycle.
319
+ """
320
+ current_pool_indices = [idx for idx, entry in enumerate(self.entries) if entry in pool]
321
+
322
+ if not current_pool_indices:
323
+ return
324
+
325
+ if randomize:
326
+ self._traversal_queue = current_pool_indices.copy()
327
+ random.shuffle(self._traversal_queue)
328
+ else:
329
+ # Round-robin: start from next after current_index
330
+ self._traversal_queue = []
331
+ start_idx = (self.current_index + 1) % len(self.entries) if self.current_index >= 0 else 0
332
+ for i in range(len(self.entries)):
333
+ idx = (start_idx + i) % len(self.entries)
334
+ if idx in current_pool_indices:
335
+ self._traversal_queue.append(idx)
336
+
337
+ self._traversal_start = time.time()
338
+ self._traversal_cycle += 1
339
+
340
+ def _find_working_entry_once(self, check_timeout, cooldown_seconds):
341
+ """
342
+ Attempt to find a working proxy from the current traversal queue once.
343
+ """
344
+ for idx in list(self._traversal_queue):
345
+ entry = self.entries[idx]
346
+ health = self._health.get(entry, {})
347
+ last_checked = health.get("last_checked", 0)
348
+ ok = health.get("ok", False)
349
+ now = time.time()
350
+
351
+ is_fresh = (now - last_checked) < cooldown_seconds
352
+
353
+ if ok and is_fresh:
354
+ logger.debug(f"Using cached working proxy: {entry[0]}:{entry[1]}")
355
+ self.current_index = idx
356
+ self._traversal_queue.remove(idx)
357
+ return entry
358
+
359
+ if not ok and is_fresh:
360
+ # This proxy failed recently, skip it for this traversal.
361
+ continue
362
+
363
+ # Stale or never checked, so we check it.
364
+ logger.debug(f"Checking proxy health: {entry[0]}:{entry[1]}")
365
+ if self._is_entry_alive(entry, timeout=check_timeout):
366
+ self.current_index = idx
367
+ self._traversal_queue.remove(idx)
368
+ return entry
369
+ else:
370
+ # It's dead. Remove it from the queue for this traversal to avoid re-checking.
371
+ self._traversal_queue.remove(idx)
372
+
373
+ raise NoWorkingProxiesError("No working proxies available in current queue")
374
+
375
+ def _wait_for_new_ip(self, entry, baseline, timeout, interval, check_timeout):
376
+ """
377
+ Poll the proxy repeatedly until its IP address changes from the baseline.
378
+ """
379
+ logger.info(f"Refreshing proxy IP (current baseline: {baseline})...")
380
+ start = time.time()
381
+ proxies = self._get_proxies_dict_from_entry(entry)
382
+
383
+ while time.time() - start < timeout:
384
+ try:
385
+ resp = requests.get(self.CHECK_IP_URL, proxies=proxies, timeout=check_timeout)
386
+ current_ip = resp.json().get("YourFuckingIPAddress")
387
+ except Exception:
388
+ current_ip = None
389
+
390
+ if current_ip and current_ip != baseline:
391
+ self._mark_entry_status(entry, True, last_ip=current_ip)
392
+ logger.info(f"IP changed from {baseline} to {current_ip}")
393
+ return
394
+
395
+ time.sleep(interval)
396
+
397
+ raise EnsureNewIPTimeoutError(f"Timed out waiting for new IP after {timeout}s")
@@ -7,7 +7,7 @@ import logging
7
7
 
8
8
  import requests
9
9
  from requests.exceptions import ConnectionError
10
- from retry import retry
10
+ from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_fixed
11
11
 
12
12
  ########################################################################################################################
13
13
  # CLASSES
@@ -43,9 +43,7 @@ class TinybirdInterface:
43
43
  }
44
44
 
45
45
  def __prepare_json_row(self, obj_dict):
46
- return json.dumps(
47
- self.__dict_lists_to_string(obj_dict), default=self.__converter
48
- )
46
+ return json.dumps(self.__dict_lists_to_string(obj_dict), default=self.__converter)
49
47
 
50
48
  @staticmethod
51
49
  def __handle_api_response(json_response):
@@ -53,28 +51,22 @@ class TinybirdInterface:
53
51
  quarantined_rows = json_response["quarantined_rows"]
54
52
 
55
53
  if quarantined_rows > 0:
56
- logger.error(
57
- f"wrong insertion of {quarantined_rows} records to Tinybird API..."
58
- )
54
+ logger.error(f"wrong insertion of {quarantined_rows} records to Tinybird API...")
59
55
  else:
60
- logger.info(
61
- f"successfully inserted {successful_rows} records to Tinybird API!"
62
- )
56
+ logger.info(f"successfully inserted {successful_rows} records to Tinybird API!")
63
57
 
64
58
  return successful_rows, quarantined_rows
65
59
 
66
- @retry(ConnectionError, tries=3, delay=2)
60
+ @retry(stop=stop_after_attempt(3), wait=wait_fixed(2), retry=retry_if_exception_type(ConnectionError))
67
61
  def __insert_data_to_endpoint(self, data):
68
- r = requests.post(self.post_url, params=self.request_params, data=data)
62
+ r = requests.post(self.post_url, params=self.request_params, data=data, timeout=30)
69
63
  return self.__handle_api_response(r.json())
70
64
 
71
65
  def insert_record_to_api(self, obj_dict):
72
66
  return self.__insert_data_to_endpoint(self.__prepare_json_row(obj_dict))
73
67
 
74
68
  def insert_batch_to_api(self, batch):
75
- return self.__insert_data_to_endpoint(
76
- "\n".join([self.__prepare_json_row(x) for x in batch])
77
- )
69
+ return self.__insert_data_to_endpoint("\n".join([self.__prepare_json_row(x) for x in batch]))
78
70
 
79
71
  def insert_pandas_df_to_api(self, df):
80
72
  return self.__insert_data_to_endpoint(df.to_json(orient="records", lines=True))