datamarket 0.7.101__py3-none-any.whl → 0.7.103__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datamarket might be problematic. Click here for more details.
- datamarket/exceptions/main.py +13 -0
- datamarket/interfaces/proxy.py +109 -5
- datamarket/utils/main.py +39 -1
- {datamarket-0.7.101.dist-info → datamarket-0.7.103.dist-info}/METADATA +1 -1
- {datamarket-0.7.101.dist-info → datamarket-0.7.103.dist-info}/RECORD +7 -7
- {datamarket-0.7.101.dist-info → datamarket-0.7.103.dist-info}/LICENSE +0 -0
- {datamarket-0.7.101.dist-info → datamarket-0.7.103.dist-info}/WHEEL +0 -0
datamarket/exceptions/main.py
CHANGED
|
@@ -2,6 +2,9 @@
|
|
|
2
2
|
# CLASSES
|
|
3
3
|
|
|
4
4
|
|
|
5
|
+
import requests
|
|
6
|
+
|
|
7
|
+
|
|
5
8
|
class RedirectionDetectedError(Exception):
|
|
6
9
|
def __init__(self, message="Redirection detected!"):
|
|
7
10
|
self.message = message
|
|
@@ -18,3 +21,13 @@ class BadRequestError(Exception):
|
|
|
18
21
|
def __init__(self, message="Bad request!"):
|
|
19
22
|
self.message = message
|
|
20
23
|
super().__init__(self.message)
|
|
24
|
+
|
|
25
|
+
class ManagedHTTPError(Exception):
    """Signal that this HTTP status was handled and should not be retried.

    Attributes:
        response: The response object passed to the constructor.
        request: ``response.request`` when the response has one, else ``None``.
        status_code: ``response.status_code`` when present, else ``None``.
        url: The explicit ``url`` argument if truthy, otherwise the request's
            URL, otherwise ``None``.
        message: The ``message`` argument exactly as given (``None`` when it
            was omitted). ``str(exc)`` falls back to
            ``"HTTP <status_code> for <url>"`` in that case.
    """

    # The annotations are quoted on purpose: an unquoted ``str | None`` is
    # evaluated when the function is defined and raises TypeError on
    # Python < 3.10, which would make this whole module unimportable there.
    def __init__(
        self,
        response: "requests.Response",
        *,
        url: "str | None" = None,
        message: "str | None" = None,
    ):
        self.response = response
        # getattr with a default tolerates response-like objects that lack
        # these attributes.
        self.request = getattr(response, "request", None)
        self.status_code = getattr(response, "status_code", None)
        self.url = url or (self.request.url if self.request is not None else None)
        self.message = message
        super().__init__(message or f"HTTP {self.status_code} for {self.url}")
|
datamarket/interfaces/proxy.py
CHANGED
|
@@ -19,6 +19,7 @@ class ProxyInterface:
|
|
|
19
19
|
def __init__(self, config):
    """Load proxy entries from ``config`` and initialise rotation and health state."""
    self._load_from_config(config)

    # Start the rotation at a random position; fall back to 0 when there
    # are no entries (randrange(0) would raise).
    if self.entries:
        self.current_index = random.randrange(len(self.entries))
    else:
        self.current_index = 0

    # Health cache: {entry: {"ok": bool, "last_checked": time.time(), "last_error": str}}
    self._health = {}
|
|
22
23
|
|
|
23
24
|
def _load_from_config(self, cfg):
|
|
24
25
|
# Tor password (optional)
|
|
@@ -55,23 +56,42 @@ class ProxyInterface:
|
|
|
55
56
|
auth = f"{user}:{password}@" if user and password else ""
|
|
56
57
|
return f"{schema}://{auth}{host}:{port}"
|
|
57
58
|
|
|
58
|
-
def get_proxies(
|
|
59
|
+
def get_proxies(
|
|
60
|
+
self,
|
|
61
|
+
use_tor=False,
|
|
62
|
+
randomize=False,
|
|
63
|
+
raw=False,
|
|
64
|
+
use_auth=False,
|
|
65
|
+
use_socks=False,
|
|
66
|
+
health_check=True,
|
|
67
|
+
check_timeout=5,
|
|
68
|
+
cooldown_seconds=600,
|
|
69
|
+
):
|
|
59
70
|
"""
|
|
60
|
-
Return parsed proxy URLs or raw entry tuple.
|
|
71
|
+
Return parsed proxy URLs or raw entry tuple for a working proxy.
|
|
61
72
|
|
|
62
73
|
:param use_tor: route via local Tor SOCKS5 if True
|
|
63
74
|
:param randomize: select a random proxy if True, otherwise round-robin
|
|
64
75
|
:param raw: return raw (host, port, user, password) tuple if True
|
|
65
76
|
:param use_auth: include proxies that require authentication if True; otherwise only credential-free
|
|
77
|
+
:param health_check: perform health checks to ensure proxy is working if True
|
|
78
|
+
:param check_timeout: timeout in seconds for health check requests
|
|
79
|
+
:param cooldown_seconds: how long to cache health status before re-checking
|
|
66
80
|
"""
|
|
67
|
-
# Tor handling
|
|
81
|
+
# Tor handling (skip health check for tor)
|
|
68
82
|
if use_tor:
|
|
69
83
|
if raw:
|
|
70
84
|
return ("127.0.0.1", "9050", None, None)
|
|
71
85
|
return {"socks5": self.get_proxy_url("127.0.0.1", 9050, schema="socks5")}
|
|
72
86
|
|
|
73
|
-
#
|
|
74
|
-
|
|
87
|
+
# Get a working entry (with health checks if enabled)
|
|
88
|
+
if health_check:
|
|
89
|
+
host, port, user, password = self._get_working_entry(
|
|
90
|
+
use_auth=use_auth, randomize=randomize, check_timeout=check_timeout, cooldown_seconds=cooldown_seconds
|
|
91
|
+
)
|
|
92
|
+
else:
|
|
93
|
+
# Legacy behavior: no health check
|
|
94
|
+
host, port, user, password = self.get_random(use_auth) if randomize else self.get_next(use_auth)
|
|
75
95
|
|
|
76
96
|
if raw:
|
|
77
97
|
return host, port, user, password
|
|
@@ -149,3 +169,87 @@ class ProxyInterface:
|
|
|
149
169
|
except Exception as ex:
|
|
150
170
|
logger.error("Failed to renew Tor IP")
|
|
151
171
|
logger.error(ex)
|
|
172
|
+
|
|
173
|
+
def mark_entry_status(self, entry, ok, error=None):
    """Store the latest health result for ``entry`` in the health cache.

    :param entry: proxy entry used as the cache key
    :param ok: whether the entry is considered working
    :param error: optional error description stored as ``last_error``
    """
    checked_at = time.time()
    self._health[entry] = {"ok": ok, "last_checked": checked_at, "last_error": error}
|
|
180
|
+
|
|
181
|
+
def is_entry_alive(self, entry, timeout=5):
    """Probe a proxy entry with a test request and record the outcome.

    A GET to ``self.CHECK_IP_URL`` is sent through the proxy. The entry counts
    as alive only when the response status is 200. The result is written to
    the health cache via ``mark_entry_status``; any exception is recorded as
    a failure with ``str(ex)`` as the error.

    :param entry: ``(host, port, user, password)`` tuple
    :param timeout: timeout in seconds passed to the test request
    :return: True if the proxy answered with HTTP 200, otherwise False
    """
    host, port, user, pwd = entry
    try:
        # Both schemes are routed through the same "http" proxy URL.
        proxies = {
            scheme: self.get_proxy_url(host, port, user, pwd, "http")
            for scheme in ("http", "https")
        }
        response = requests.get(self.CHECK_IP_URL, proxies=proxies, timeout=timeout)
        alive = response.status_code == 200
        self.mark_entry_status(entry, alive)
        return alive
    except Exception as ex:
        self.mark_entry_status(entry, False, str(ex))
        return False
|
|
196
|
+
|
|
197
|
+
def _get_working_entry(self, use_auth=False, randomize=False, check_timeout=5, cooldown_seconds=60):
    """Get a working proxy entry, performing health checks as needed.

    Candidates are taken from ``self.entries``. Unless ``use_auth`` is True,
    only credential-free entries are considered; if that leaves nothing, all
    entries are used. Each candidate is then handled according to the health
    cache:

    * cached as working within ``cooldown_seconds``: returned immediately;
    * cached as failed within ``cooldown_seconds``: skipped for now;
    * otherwise: probed with ``is_entry_alive``.

    If nothing worked, the entries skipped because of a cached failure are
    force-checked. Entries that were just probed and failed are not probed a
    second time.

    :param use_auth: include proxies that require authentication if True
    :param randomize: try candidates in random order if True, otherwise
        round-robin starting at ``current_index``
    :param check_timeout: timeout in seconds for each health check
    :param cooldown_seconds: how long a cached health result stays valid
    :return: the first working ``(host, port, user, password)`` entry
    :raises RuntimeError: if there are no entries or none of them works
    """
    if not self.entries:
        raise RuntimeError("No proxies available")

    # Build candidate list respecting use_auth and randomize/round-robin
    pool = self.entries if use_auth else [e for e in self.entries if not e[2] and not e[3]]
    if not pool:
        pool = self.entries

    if randomize:
        # A plain copy would keep config order and always favour the first
        # entry; sample() returns a shuffled copy without touching the pool.
        candidates = random.sample(pool, len(pool))
    else:
        candidates = self._get_round_robin_candidates(pool)

    # First pass: trust fresh cache results, probe everything else
    skipped = []
    for entry in candidates:
        health = self._health.get(entry, {})
        is_fresh = (time.time() - health.get("last_checked", 0)) < cooldown_seconds

        if is_fresh and health.get("ok", False):
            logger.debug(f"Using cached working proxy: {entry[0]}:{entry[1]}")
            return entry
        if is_fresh:
            # Cached as failed and recent: revisit only if nothing else works
            skipped.append(entry)
            continue

        # Not cached or expired, check now
        logger.debug(f"Checking proxy health: {entry[0]}:{entry[1]}")
        if self.is_entry_alive(entry, timeout=check_timeout):
            return entry

    # Second pass: force a fresh check, but only for entries the cache made
    # us skip. The others failed moments ago, and re-probing them would just
    # double the worst-case wait.
    if skipped:
        logger.warning("No cached working proxies, forcing fresh checks")
    for entry in skipped:
        logger.debug(f"Force-checking proxy: {entry[0]}:{entry[1]}")
        if self.is_entry_alive(entry, timeout=check_timeout):
            return entry

    # No working proxies found
    raise RuntimeError("No working proxies available")
|
|
238
|
+
|
|
239
|
+
def _get_round_robin_candidates(self, pool):
    """Return the entries of ``pool`` in rotation order and advance the rotation.

    ``self.entries`` is walked once, starting at ``self.current_index`` and
    wrapping around; entries that are also in ``pool`` are kept in that order.
    When at least one candidate is found, ``self.current_index`` is moved to
    the position right after the first candidate so the next call starts
    there.

    :param pool: entries that are eligible to be returned
    :return: list of eligible entries in rotation order (possibly empty)
    """
    total = len(self.entries)
    origin = self.current_index
    rotated = (self.entries[(origin + offset) % total] for offset in range(total))
    ordered = [item for item in rotated if item in pool]

    if not ordered:
        return ordered

    # Update current_index for next call
    try:
        first_pos = self.entries.index(ordered[0])
    except ValueError:
        return ordered
    self.current_index = (first_pos + 1) % total
    return ordered
|
datamarket/utils/main.py
CHANGED
|
@@ -10,6 +10,7 @@ import re
|
|
|
10
10
|
import shlex
|
|
11
11
|
import subprocess
|
|
12
12
|
import time
|
|
13
|
+
from typing import Sequence
|
|
13
14
|
from babel.numbers import parse_decimal
|
|
14
15
|
|
|
15
16
|
from bs4 import BeautifulSoup
|
|
@@ -26,6 +27,8 @@ from tenacity import (
|
|
|
26
27
|
wait_exponential,
|
|
27
28
|
)
|
|
28
29
|
|
|
30
|
+
from datamarket.exceptions.main import ManagedHTTPError
|
|
31
|
+
|
|
29
32
|
from ..exceptions import RedirectionDetectedError, NotFoundError, BadRequestError
|
|
30
33
|
from ..interfaces.proxy import ProxyInterface
|
|
31
34
|
|
|
@@ -132,7 +135,9 @@ def parse_field(dict_struct, field_path, format_method=None):
|
|
|
132
135
|
|
|
133
136
|
|
|
134
137
|
@retry(
|
|
135
|
-
retry=retry_if_not_exception_type(
|
|
138
|
+
retry=retry_if_not_exception_type(
|
|
139
|
+
(NotFoundError, BadRequestError, RedirectionDetectedError, ProxyError, ManagedHTTPError)
|
|
140
|
+
),
|
|
136
141
|
wait=wait_exponential(exp_base=3, multiplier=3, max=60),
|
|
137
142
|
stop=stop_after_attempt(5),
|
|
138
143
|
before_sleep=before_sleep_log(logger, logging.WARNING),
|
|
@@ -146,8 +151,37 @@ def get_data(
|
|
|
146
151
|
proxy_interface: ProxyInterface = None,
|
|
147
152
|
use_auth_proxies: bool = False,
|
|
148
153
|
max_proxy_delay: timedelta = timedelta(minutes=10),
|
|
154
|
+
ignored_status_codes: Sequence[int] = (),
|
|
149
155
|
**kwargs,
|
|
150
156
|
):
|
|
157
|
+
"""
|
|
158
|
+
Fetches data from a given URL using HTTP requests, with support for proxy configuration, retries, and flexible output formats.
|
|
159
|
+
|
|
160
|
+
Args:
|
|
161
|
+
url (str): The target URL to fetch data from.
|
|
162
|
+
method (str, optional): HTTP method to use (e.g., 'GET', 'POST'). Defaults to 'GET'.
|
|
163
|
+
output (str, optional): Output format ('json', 'text', 'soup', 'response'). Defaults to 'json'.
|
|
164
|
+
sleep (tuple, optional): Tuple specifying max and min sleep times (seconds) after request. Defaults to (6, 3).
|
|
165
|
+
proxy_interface (ProxyInterface, optional): Proxy provider. If None, no proxy is used. Defaults to None.
|
|
166
|
+
use_auth_proxies (bool, optional): Whether to use authenticated proxies. Defaults to False.
|
|
167
|
+
max_proxy_delay (timedelta, optional): Maximum delay for proxy retry logic. Defaults to 10 minutes.
|
|
168
|
+
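ignored_status_codes (Sequence[int], optional): Status codes that are treated as handled: a response with one of these codes raises ManagedHTTPError (which carries the response) and is not retried. Defaults to ().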
|
|
169
|
+
**kwargs: Additional arguments passed to the requests method.
|
|
170
|
+
|
|
171
|
+
Returns:
|
|
172
|
+
Depends on the 'output' argument:
|
|
173
|
+
- 'json': Parsed JSON response.
|
|
174
|
+
- 'text': Response text.
|
|
175
|
+
- 'soup': BeautifulSoup-parsed HTML.
|
|
176
|
+
- 'response': Raw requests.Response object.
|
|
177
|
+
|
|
178
|
+
Raises:
|
|
179
|
+
ManagedHTTPError: If a response status code is in `ignored_status_codes`.
|
|
180
|
+
NotFoundError: If a 404 status code is returned.
|
|
181
|
+
BadRequestError: If a 400 status code is returned.
|
|
182
|
+
RedirectionDetectedError, ProxyError: On specific error conditions.
|
|
183
|
+
requests.HTTPError: For other HTTP errors if not ignored.
|
|
184
|
+
"""
|
|
151
185
|
retry_type = retry_if_exception_type(ProxyError)
|
|
152
186
|
wait = wait_exponential(exp_base=3, multiplier=3, max=60)
|
|
153
187
|
stop = stop_after_delay(max_proxy_delay)
|
|
@@ -174,11 +208,15 @@ def get_data(
|
|
|
174
208
|
|
|
175
209
|
ban_sleep(*sleep)
|
|
176
210
|
|
|
211
|
+
if r.status_code in ignored_status_codes:
|
|
212
|
+
raise ManagedHTTPError(r, url=url, message=f"Status {r.status_code} in ignored_status_codes for URL {url}")
|
|
177
213
|
if r.status_code == 404:
|
|
178
214
|
raise NotFoundError(f"404 Not Found error for {url}")
|
|
179
215
|
if r.status_code == 400:
|
|
180
216
|
raise BadRequestError(f"400 Bad Request error for {url}")
|
|
217
|
+
|
|
181
218
|
r.raise_for_status()
|
|
219
|
+
|
|
182
220
|
r.encoding = "utf-8"
|
|
183
221
|
|
|
184
222
|
if output == "json":
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
datamarket/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
2
|
datamarket/exceptions/__init__.py,sha256=-Vu-RZNKjW6fYCLqbUJTkKNuHeA8Yi_gyR50oZNaA_8,33
|
|
3
|
-
datamarket/exceptions/main.py,sha256=
|
|
3
|
+
datamarket/exceptions/main.py,sha256=SuP-ZKZIxJYdnOpNb63Y7BpYGRhLl-4JIyTEqgUoWV4,1205
|
|
4
4
|
datamarket/interfaces/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
5
5
|
datamarket/interfaces/alchemy.py,sha256=i2lKLLLy3-jpbzV3-jxfRCXTy7jRoTsNU3063pmSonk,15749
|
|
6
6
|
datamarket/interfaces/aws.py,sha256=co5JkC3iFIp-0FqdYX4eKy3_m71LhZKuJoW6kXwEImc,4780
|
|
@@ -9,14 +9,14 @@ datamarket/interfaces/drive.py,sha256=3nhx3THr2SHNWKYwme9F2nPpvsqyEMFIxz0whF2FjH
|
|
|
9
9
|
datamarket/interfaces/ftp.py,sha256=K219-PP21EhQo1A1LkvRLahlrw2-pf4svBN0LogZaJE,2813
|
|
10
10
|
datamarket/interfaces/nominatim.py,sha256=TjS9O2U446XuPUzfP65NwDSG-RDNqmYb6-NKikM-34w,15187
|
|
11
11
|
datamarket/interfaces/peerdb.py,sha256=sO451wEGNb_0DDwchZ6eBVYKltqHM5XKau-WsfspXzA,23640
|
|
12
|
-
datamarket/interfaces/proxy.py,sha256=
|
|
12
|
+
datamarket/interfaces/proxy.py,sha256=YNPNDFd2xTF-P5MITRHxGCLiXD8Fal4HK0yN8KhuYgI,9738
|
|
13
13
|
datamarket/interfaces/tinybird.py,sha256=AYrcRGNOCoCt7ojilkWa27POROee9sTCwZ61GGHEPeM,2698
|
|
14
14
|
datamarket/params/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
15
15
|
datamarket/params/nominatim.py,sha256=RnmYXGoJQCijOsuCavCYcxw98WvOd_vOMK4KaraI0RU,11967
|
|
16
16
|
datamarket/utils/__init__.py,sha256=FHLh-Qp9XpM4LkAocppCf_llW2CWVVghGorkqxqt1wk,34
|
|
17
17
|
datamarket/utils/airflow.py,sha256=al0vc0YUikNu3Oy51VSn52I7pMU40akFBOl_UlHa2E4,795
|
|
18
18
|
datamarket/utils/alchemy.py,sha256=SRq6kgh1aANXVShBPgAuglmNhZssPWwWEY503gKSia8,635
|
|
19
|
-
datamarket/utils/main.py,sha256=
|
|
19
|
+
datamarket/utils/main.py,sha256=OORsHggUqa2lKj5AG5LTPzEvXfAtx3ry4rSaAwkuS38,8001
|
|
20
20
|
datamarket/utils/nominatim.py,sha256=IxexKY2KOlDhiKtzsqQfoVUjJXPxJl7tn3iHUaQKg08,5795
|
|
21
21
|
datamarket/utils/playwright/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
22
22
|
datamarket/utils/playwright/async_api.py,sha256=UbA2D4ScBtYeMfrRjly4RO-s8wXIub9c05J1eoOCpsQ,5782
|
|
@@ -29,7 +29,7 @@ datamarket/utils/strings/obfuscation.py,sha256=Jo-x3f2Cb75983smmpcdPqUlBrLCTyrnm
|
|
|
29
29
|
datamarket/utils/strings/standardization.py,sha256=c8CAG6HI3AfK0hB3A3IGwsbnQebZ6R3PrA5PELHRXM0,1492
|
|
30
30
|
datamarket/utils/typer.py,sha256=FDF3l6gh3UlAFPsHCtesnekvct2rKz0oFn3uKARBQvE,814
|
|
31
31
|
datamarket/utils/types.py,sha256=vxdQZdwdXrfPR4Es52gBgol-tMRIOD6oK9cBo3rB0JQ,74
|
|
32
|
-
datamarket-0.7.
|
|
33
|
-
datamarket-0.7.
|
|
34
|
-
datamarket-0.7.
|
|
35
|
-
datamarket-0.7.
|
|
32
|
+
datamarket-0.7.103.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
|
33
|
+
datamarket-0.7.103.dist-info/METADATA,sha256=XAS_V3qRLGddVGC6sRhxavgwa4EdzONa1B-YWnvcMK0,7382
|
|
34
|
+
datamarket-0.7.103.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
|
|
35
|
+
datamarket-0.7.103.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|