datamarket 0.7.103__tar.gz → 0.7.104__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datamarket might be problematic.
- {datamarket-0.7.103 → datamarket-0.7.104}/PKG-INFO +1 -1
- {datamarket-0.7.103 → datamarket-0.7.104}/pyproject.toml +1 -1
- {datamarket-0.7.103 → datamarket-0.7.104}/src/datamarket/exceptions/main.py +15 -1
- {datamarket-0.7.103 → datamarket-0.7.104}/src/datamarket/interfaces/alchemy.py +11 -12
- {datamarket-0.7.103 → datamarket-0.7.104}/src/datamarket/interfaces/aws.py +2 -1
- {datamarket-0.7.103 → datamarket-0.7.104}/src/datamarket/interfaces/azure.py +6 -12
- {datamarket-0.7.103 → datamarket-0.7.104}/src/datamarket/interfaces/ftp.py +3 -11
- {datamarket-0.7.103 → datamarket-0.7.104}/src/datamarket/interfaces/nominatim.py +7 -9
- {datamarket-0.7.103 → datamarket-0.7.104}/src/datamarket/interfaces/proxy.py +118 -38
- {datamarket-0.7.103 → datamarket-0.7.104}/src/datamarket/interfaces/tinybird.py +4 -12
- {datamarket-0.7.103 → datamarket-0.7.104}/src/datamarket/params/nominatim.py +10 -13
- datamarket-0.7.104/src/datamarket/utils/__init__.py +1 -0
- {datamarket-0.7.103 → datamarket-0.7.104}/src/datamarket/utils/airflow.py +10 -7
- {datamarket-0.7.103 → datamarket-0.7.104}/src/datamarket/utils/alchemy.py +2 -1
- {datamarket-0.7.103 → datamarket-0.7.104}/src/datamarket/utils/main.py +4 -4
- {datamarket-0.7.103 → datamarket-0.7.104}/src/datamarket/utils/nominatim.py +18 -21
- {datamarket-0.7.103 → datamarket-0.7.104}/src/datamarket/utils/playwright/async_api.py +5 -2
- {datamarket-0.7.103 → datamarket-0.7.104}/src/datamarket/utils/playwright/sync_api.py +7 -2
- {datamarket-0.7.103 → datamarket-0.7.104}/src/datamarket/utils/selenium.py +6 -12
- {datamarket-0.7.103 → datamarket-0.7.104}/src/datamarket/utils/strings/normalization.py +5 -11
- {datamarket-0.7.103 → datamarket-0.7.104}/src/datamarket/utils/strings/standardization.py +4 -2
- {datamarket-0.7.103 → datamarket-0.7.104}/src/datamarket/utils/typer.py +2 -1
- datamarket-0.7.103/src/datamarket/exceptions/__init__.py +0 -1
- {datamarket-0.7.103 → datamarket-0.7.104}/LICENSE +0 -0
- {datamarket-0.7.103 → datamarket-0.7.104}/README.md +0 -0
- {datamarket-0.7.103 → datamarket-0.7.104}/src/datamarket/__init__.py +0 -0
- {datamarket-0.7.103/src/datamarket/utils → datamarket-0.7.104/src/datamarket/exceptions}/__init__.py +0 -0
- {datamarket-0.7.103 → datamarket-0.7.104}/src/datamarket/interfaces/__init__.py +0 -0
- {datamarket-0.7.103 → datamarket-0.7.104}/src/datamarket/interfaces/drive.py +0 -0
- {datamarket-0.7.103 → datamarket-0.7.104}/src/datamarket/interfaces/peerdb.py +0 -0
- {datamarket-0.7.103 → datamarket-0.7.104}/src/datamarket/params/__init__.py +0 -0
- {datamarket-0.7.103 → datamarket-0.7.104}/src/datamarket/utils/playwright/__init__.py +0 -0
- {datamarket-0.7.103 → datamarket-0.7.104}/src/datamarket/utils/soda.py +0 -0
- {datamarket-0.7.103 → datamarket-0.7.104}/src/datamarket/utils/strings/__init__.py +0 -0
- {datamarket-0.7.103 → datamarket-0.7.104}/src/datamarket/utils/strings/obfuscation.py +0 -0
- {datamarket-0.7.103 → datamarket-0.7.104}/src/datamarket/utils/types.py +0 -0
{datamarket-0.7.103 → datamarket-0.7.104}/src/datamarket/exceptions/main.py

@@ -21,9 +21,11 @@ class BadRequestError(Exception):
     def __init__(self, message="Bad request!"):
         self.message = message
         super().__init__(self.message)
-
+
+
 class ManagedHTTPError(Exception):
     """Signal that this HTTP status was handled and should not be retried."""
+
     def __init__(self, response: requests.Response, *, url: str | None = None, message: str | None = None):
         self.response = response
         self.request = getattr(response, "request", None)
@@ -31,3 +33,15 @@ class ManagedHTTPError(Exception):
         self.url = url or (self.request.url if self.request is not None else None)
         self.message = message
         super().__init__(message or f"HTTP {self.status_code} for {self.url}")
+
+
+class NoWorkingProxiesError(Exception):
+    def __init__(self, message="No working proxies available"):
+        self.message = message
+        super().__init__(self.message)
+
+
+class EnsureNewIPTimeoutError(Exception):
+    def __init__(self, message="Timed out waiting for new IP"):
+        self.message = message
+        super().__init__(self.message)
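The two new exception types give proxy consumers something precise to catch. A minimal sketch of downstream handling (the method name `get_proxy` and its keyword arguments are inferred from the proxy.py hunks below, and `config` is a hypothetical parsed config object):

    from datamarket.exceptions import EnsureNewIPTimeoutError, NoWorkingProxiesError
    from datamarket.interfaces.proxy import ProxyInterface

    proxy = ProxyInterface(config)
    try:
        proxies = proxy.get_proxy(health_check=True, ensure_new_ip=True)
    except NoWorkingProxiesError:
        pass  # every configured proxy failed its health check
    except EnsureNewIPTimeoutError:
        pass  # the single available proxy never rotated to a new IP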
{datamarket-0.7.103 → datamarket-0.7.104}/src/datamarket/interfaces/alchemy.py

@@ -3,6 +3,7 @@
 
 import logging
 from collections.abc import MutableMapping
+from enum import Enum, auto
 from typing import Any, Iterator, List, Optional, Type, TypeVar, Union
 from urllib.parse import quote_plus
 
@@ -12,7 +13,6 @@ from sqlalchemy.exc import IntegrityError
 from sqlalchemy.ext.declarative import DeclarativeMeta
 from sqlalchemy.orm import Session, sessionmaker
 from sqlalchemy.sql.expression import ClauseElement
-from enum import Enum, auto
 
 ########################################################################################################################
 # CLASSES
@@ -198,29 +198,28 @@ class AlchemyInterface:
     @staticmethod
    def _log_integrity_error(ex: IntegrityError, alchemy_obj, action="insert"):
         """
-
-
+        Compact, readable IntegrityError logger using SQLSTATE codes.
+        Consult https://www.postgresql.org/docs/current/errcodes-appendix.html for details.
         """
 
         PG_ERROR_LABELS = {
-
-
-
-
-
-
-
+            "23000": "Integrity constraint violation",
+            "23001": "Restrict violation",
+            "23502": "NOT NULL violation",
+            "23503": "Foreign key violation",
+            "23505": "Unique violation",
+            "23514": "Check constraint violation",
+            "23P01": "Exclusion constraint violation",
         }
         code = getattr(ex.orig, "pgcode", None)
         label = PG_ERROR_LABELS.get(code, "Integrity error (unspecified)")
 
         # Log one clean message with trace + the raw DB message separately
-        if code == "23505":
+        if code == "23505":  # A simple info log for unique violations
             logger.info(f"{label} trying to {action} {alchemy_obj}")
         else:
             logger.error(f"{label} trying to {action} {alchemy_obj}\nPostgreSQL message: {ex.orig}")
 
-
     def insert_alchemy_obj(self, alchemy_obj: ModelType, silent: bool = False) -> bool:
         if self.session is None:
             raise RuntimeError("Session not active. Use 'with AlchemyInterface(...):' or call start()")
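`_log_integrity_error` keys everything off `pgcode`, the five-character SQLSTATE that the psycopg2 exception carries in `ex.orig`. A hedged sketch of how calling code could lean on it (`interface` and `obj` are hypothetical, and `insert_alchemy_obj` may already route errors through this helper internally):

    from sqlalchemy.exc import IntegrityError

    try:
        interface.insert_alchemy_obj(obj)
    except IntegrityError as ex:
        # ex.orig.pgcode == "23505" (unique violation) logs at INFO;
        # every other SQLSTATE in the table logs at ERROR with the raw DB message
        AlchemyInterface._log_integrity_error(ex, obj, action="insert")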
{datamarket-0.7.103 → datamarket-0.7.104}/src/datamarket/interfaces/aws.py

@@ -3,8 +3,9 @@
 
 import io
 import logging
+from typing import Any, Dict, List, Optional
+
 import boto3
-from typing import Optional, List, Dict, Any
 
 ########################################################################################################################
 # CLASSES
{datamarket-0.7.103 → datamarket-0.7.104}/src/datamarket/interfaces/azure.py

@@ -29,17 +29,15 @@ class AzureBlobInterface:
                 {
                     "profile": profile_name,
                     "container_name": container_name,
-                    "session": BlobServiceClient.from_connection_string(
-                        connection_string
-                    )
+                    "session": BlobServiceClient.from_connection_string(connection_string).get_container_client(
+                        container_name
+                    ),
                 }
             )
 
         if not self.profiles:
             logger.warning("No Azure profiles found in config file")
-        self.current_profile: Optional[Dict[str, Any]] = (
-            self.profiles[0] if self.profiles else None
-        )
+        self.current_profile: Optional[Dict[str, Any]] = self.profiles[0] if self.profiles else None
 
     def switch_profile(self, profile_name: str) -> None:
         for profile in self.profiles:
@@ -109,14 +107,10 @@ class AzureBlobInterface:
         if blob_client.exists():
             properties = blob_client.get_blob_properties()
             if properties.size > 100:  # Check if size is greater than 100 bytes
-                logger.debug(
-                    f"Blob '{remote_path}' exists and is not empty (size: {properties.size})."
-                )
+                logger.debug(f"Blob '{remote_path}' exists and is not empty (size: {properties.size}).")
                 return True
             else:
-                logger.debug(
-                    f"Blob '{remote_path}' exists but size ({properties.size}) is not > 100 bytes."
-                )
+                logger.debug(f"Blob '{remote_path}' exists but size ({properties.size}) is not > 100 bytes.")
                 return False
         else:
             logger.debug(f"Blob '{remote_path}' does not exist.")
{datamarket-0.7.103 → datamarket-0.7.104}/src/datamarket/interfaces/ftp.py

@@ -20,22 +20,14 @@ class FTPInterface:
             if section.startswith("ftp:"):
                 profile_name = section.split(":", 1)[1]
                 ftps = self.config[section]["ftps"].lower() == "true"
-                ftp_conn = (
-                    FTP_TLS(self.config[section]["server"])
-                    if ftps
-                    else FTP(self.config[section]["server"])
-                )  # noqa: S321
-                ftp_conn.login(
-                    self.config[section]["username"], self.config[section]["password"]
-                )
+                ftp_conn = FTP_TLS(self.config[section]["server"]) if ftps else FTP(self.config[section]["server"])  # noqa: S321
+                ftp_conn.login(self.config[section]["username"], self.config[section]["password"])
                 self.profiles.append({"profile": profile_name, "session": ftp_conn})
 
         if not self.profiles:
             logger.warning("no ftp section in config")
 
-        self.current_profile: Optional[Dict[str, Any]] = (
-            self.profiles[0] if self.profiles else None
-        )
+        self.current_profile: Optional[Dict[str, Any]] = self.profiles[0] if self.profiles else None
         self.ftp = self.current_profile["session"] if self.current_profile else None
 
     def switch_profile(self, profile_name: str) -> None:
{datamarket-0.7.103 → datamarket-0.7.104}/src/datamarket/interfaces/nominatim.py

@@ -11,8 +11,8 @@ from geopy.distance import geodesic
 from jellyfish import jaro_winkler_similarity
 
 from ..params.nominatim import CITY_TO_PROVINCE, POSTCODES
-from ..utils.strings import normalize
 from ..utils.nominatim import standardize_admin_division
+from ..utils.strings import normalize
 
 ########################################################################################################################
 # PARAMETERS
@@ -335,16 +335,14 @@ class Nominatim:
             selected_province_from_postcode,
             selected_state,
         )
-
+
         # Standardize
         final_result["province"] = standardize_admin_division(
-            name=final_result["province"],
-
-            country_code=final_result["country_code"])
+            name=final_result["province"], level="province", country_code=final_result["country_code"]
+        )
         final_result["state"] = standardize_admin_division(
-            name=final_result["state"],
-
-            country_code=final_result["country_code"])
+            name=final_result["state"], level="state", country_code=final_result["country_code"]
+        )
         return final_result
 
 
@@ -358,4 +356,4 @@ class NominatimInterface(Nominatim):
 
             super().__init__(self.nominatim_endpoint, self.geonames_endpoint)
         else:
-            logger.warning("no osm section in config")
+            logger.warning("no osm section in config")
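Both standardization calls now pass the administrative `level` explicitly alongside the country code. Isolated for clarity, the calls look like this (argument values are invented for illustration):

    province = standardize_admin_division(name="Coruna", level="province", country_code="es")
    state = standardize_admin_division(name="Galicia", level="state", country_code="es")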
{datamarket-0.7.103 → datamarket-0.7.104}/src/datamarket/interfaces/proxy.py

@@ -1,10 +1,14 @@
 import logging
-import time
 import random
+import time
+
 import requests
+import tenacity
 from stem import Signal
 from stem.control import Controller
 
+from datamarket.exceptions import EnsureNewIPTimeoutError, NoWorkingProxiesError
+
 logger = logging.getLogger(__name__)
 logging.getLogger("stem").setLevel(logging.WARNING)
 
@@ -18,7 +22,7 @@ class ProxyInterface:
 
     def __init__(self, config):
         self._load_from_config(config)
-        self.current_index = random.randrange(len(self.entries)) if self.entries else 0
+        self.current_index = random.randrange(len(self.entries)) if self.entries else 0  # noqa: S311
         self._health = {}  # {entry: {"ok": bool, "last_checked": time.time(), "last_error": str}}
 
     def _load_from_config(self, cfg):
@@ -66,6 +70,10 @@ class ProxyInterface:
         health_check=True,
         check_timeout=5,
         cooldown_seconds=600,
+        ensure_new_ip=False,
+        ensure_new_ip_timeout=600,
+        ensure_new_ip_interval=5,
+        max_retry_seconds=600,
     ):
         """
         Return parsed proxy URLs or raw entry tuple for a working proxy.
@@ -77,6 +85,10 @@ class ProxyInterface:
         :param health_check: perform health checks to ensure proxy is working if True
         :param check_timeout: timeout in seconds for health check requests
         :param cooldown_seconds: how long to cache health status before re-checking
+        :param ensure_new_ip: if True and only one proxy available, wait until IP changes before returning
+        :param ensure_new_ip_timeout: max seconds to wait for IP change when ensure_new_ip=True
+        :param ensure_new_ip_interval: seconds between IP checks when ensure_new_ip=True
+        :param max_retry_seconds: max seconds to retry finding working proxies (0 to disable)
         """
         # Tor handling (skip health check for tor)
         if use_tor:
@@ -87,7 +99,14 @@ class ProxyInterface:
         # Get a working entry (with health checks if enabled)
         if health_check:
             host, port, user, password = self._get_working_entry(
-                use_auth=use_auth,
+                use_auth=use_auth,
+                randomize=randomize,
+                check_timeout=check_timeout,
+                cooldown_seconds=cooldown_seconds,
+                ensure_new_ip=ensure_new_ip,
+                ensure_new_ip_timeout=ensure_new_ip_timeout,
+                ensure_new_ip_interval=ensure_new_ip_interval,
+                max_retry_seconds=max_retry_seconds,
             )
         else:
             # Legacy behavior: no health check
@@ -110,7 +129,7 @@ class ProxyInterface:
     def get_next(self, use_auth=False):
         # Round-robin selection, optionally filtering out authenticated proxies
         if not self.entries:
-            raise
+            raise NoWorkingProxiesError("No proxies available")
 
         pool = self.entries if use_auth else [e for e in self.entries if not e[2] and not e[3]]
         if not pool:
@@ -130,13 +149,13 @@ class ProxyInterface:
     def get_random(self, use_auth=False):
         # Random selection, optionally filtering out authenticated proxies
         if not self.entries:
-            raise
+            raise NoWorkingProxiesError("No proxies available")
 
         pool = self.entries if use_auth else [e for e in self.entries if not e[2] and not e[3]]
         if not pool:
             pool = self.entries
 
-        entry = random.choice(pool)
+        entry = random.choice(pool)  # noqa: S311
         # Update index to after selected entry for round-robin continuity
         try:
             pos = self.entries.index(entry)
@@ -146,9 +165,10 @@ class ProxyInterface:
 
         return entry
 
-    def check_current_ip(self):
+    def check_current_ip(self, proxies=None):
         try:
-
+            proxies_arg = proxies or {"http": self.proxies["http"]}
+            resp = requests.get(self.CHECK_IP_URL, proxies=proxies_arg, timeout=30)
             return resp.json().get("YourFuckingIPAddress")
         except Exception as ex:
             logger.error(ex)
@@ -170,12 +190,13 @@ class ProxyInterface:
             logger.error("Failed to renew Tor IP")
             logger.error(ex)
 
-    def mark_entry_status(self, entry, ok, error=None):
+    def mark_entry_status(self, entry, ok, error=None, last_ip=None):
         """Update health cache for an entry."""
         self._health[entry] = {
             "ok": ok,
             "last_checked": time.time(),
             "last_error": error,
+            "last_ip": last_ip,
         }
 
     def is_entry_alive(self, entry, timeout=5):
@@ -188,25 +209,84 @@ class ProxyInterface:
             }
             resp = requests.get(self.CHECK_IP_URL, proxies=proxies, timeout=timeout)
             ok = resp.status_code == 200
-
+            last_ip = resp.json().get("YourFuckingIPAddress") if ok else None
+            self.mark_entry_status(entry, ok, last_ip=last_ip)
             return ok
         except Exception as ex:
             self.mark_entry_status(entry, False, str(ex))
             return False
 
-    def _get_working_entry(
+    def _get_working_entry(
+        self,
+        use_auth=False,
+        randomize=False,
+        check_timeout=5,
+        cooldown_seconds=60,
+        ensure_new_ip=False,
+        ensure_new_ip_timeout=600,
+        ensure_new_ip_interval=5,
+        max_retry_seconds=600,
+    ):
         """Get a working proxy entry, performing health checks as needed."""
-
-
+        pool = self._build_pool(use_auth)
+        candidates = self._get_candidates(pool, randomize)
+
+        def _find_working_entry():
+            if not self.entries:
+                raise NoWorkingProxiesError("No proxies available")
+            return self._find_working_entry_once(candidates, check_timeout, cooldown_seconds)
+
+        if max_retry_seconds > 0:
+            retrying = tenacity.Retrying(
+                stop=tenacity.stop_after_delay(max_retry_seconds),
+                reraise=True,
+            )
+            entry = retrying(_find_working_entry)
+        else:
+            entry = _find_working_entry()
+
+        if ensure_new_ip and len(pool) == 1:
+            logger.info(f"ensure_new_ip=True and single proxy, waiting for IP change: {entry[0]}:{entry[1]}")
+            baseline = self._health.get(entry, {}).get("last_ip")
+            if not baseline:
+                if not self.is_entry_alive(entry, timeout=check_timeout):
+                    raise NoWorkingProxiesError("Proxy became unavailable during ensure_new_ip")
+                baseline = self._health.get(entry, {}).get("last_ip")
+            entry = self._wait_for_new_ip(entry, baseline, ensure_new_ip_timeout, ensure_new_ip_interval, check_timeout)
+
+        return entry
+
+    def _get_round_robin_candidates(self, pool):
+        """Get candidates in round-robin order starting from current_index."""
+        candidates = []
+        start_idx = self.current_index
+        for i in range(len(self.entries)):
+            idx = (start_idx + i) % len(self.entries)
+            entry = self.entries[idx]
+            if entry in pool:
+                candidates.append(entry)
+        # Update current_index for next call
+        if candidates:
+            try:
+                pos = self.entries.index(candidates[0])
+                self.current_index = (pos + 1) % len(self.entries)
+            except ValueError:
+                pass
+        return candidates
 
-
+    def _build_pool(self, use_auth):
         pool = self.entries if use_auth else [e for e in self.entries if not e[2] and not e[3]]
         if not pool:
             pool = self.entries
+        return pool
 
-
+    def _get_candidates(self, pool, randomize):
+        if randomize:
+            return pool[:]
+        else:
+            return self._get_round_robin_candidates(pool)
 
-
+    def _find_working_entry_once(self, candidates, check_timeout, cooldown_seconds):
         for entry in candidates:
             health = self._health.get(entry, {})
             last_checked = health.get("last_checked", 0)
@@ -214,42 +294,42 @@ class ProxyInterface:
             now = time.time()
 
             if ok and (now - last_checked) < cooldown_seconds:
-                # Cached as working and recent
                 logger.debug(f"Using cached working proxy: {entry[0]}:{entry[1]}")
                 return entry
             elif not ok and (now - last_checked) < cooldown_seconds:
-                # Cached as failed and recent, skip
                 continue
             else:
-                # Not cached or expired, check now
                 logger.debug(f"Checking proxy health: {entry[0]}:{entry[1]}")
                 if self.is_entry_alive(entry, timeout=check_timeout):
                     return entry
 
-        # Second pass: force fresh check for all candidates (in case cache skipped everything)
         logger.warning("No cached working proxies, forcing fresh checks")
         for entry in candidates:
            logger.debug(f"Force-checking proxy: {entry[0]}:{entry[1]}")
            if self.is_entry_alive(entry, timeout=check_timeout):
                return entry
 
-
-        raise RuntimeError("No working proxies available")
+        raise NoWorkingProxiesError("No working proxies available")
 
-    def
-
-
-
-
-
-
-
-        candidates.append(entry)
-        # Update current_index for next call
-        if candidates:
+    def _wait_for_new_ip(self, entry, baseline, timeout, interval, check_timeout):
+        start = time.time()
+        while time.time() - start < timeout:
+            host, port, user, pwd = entry
+            proxies_map = {
+                "http": self.get_proxy_url(host, port, user, pwd, "http"),
+                "https": self.get_proxy_url(host, port, user, pwd, "http"),
+            }
             try:
-
-
-            except
-
-
+                resp = requests.get(self.CHECK_IP_URL, proxies=proxies_map, timeout=check_timeout)
+                current_ip = resp.json().get("YourFuckingIPAddress")
+            except Exception:
+                current_ip = None
+
+            if current_ip and current_ip != baseline:
+                self.mark_entry_status(entry, True, last_ip=current_ip)
+                logger.info(f"IP changed from {baseline} to {current_ip}")
+                return entry
+
+            time.sleep(interval)
+
+        raise EnsureNewIPTimeoutError(f"Timed out waiting for new IP after {timeout}s")
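The restructured `_get_working_entry` splits selection into `_build_pool`, `_get_candidates`, and `_find_working_entry_once`, then wraps the single-pass lookup in tenacity's imperative retry object so failures keep retrying until `max_retry_seconds` is spent. A condensed, self-contained sketch of that retry pattern (it mirrors the added lines but is not a verbatim excerpt):

    import tenacity

    def find_working_entry():
        # stand-in for the inner closure; raises on failure
        raise NoWorkingProxiesError("No proxies available")

    retrying = tenacity.Retrying(
        stop=tenacity.stop_after_delay(600),  # max_retry_seconds
        reraise=True,  # surface NoWorkingProxiesError itself, not tenacity.RetryError
    )
    entry = retrying(find_working_entry)  # re-invokes the callable on every exception

With `reraise=True`, the final failure propagates as the domain exception, which is what lets callers catch `NoWorkingProxiesError` directly instead of unwrapping a `RetryError`.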
{datamarket-0.7.103 → datamarket-0.7.104}/src/datamarket/interfaces/tinybird.py

@@ -43,9 +43,7 @@ class TinybirdInterface:
         }
 
     def __prepare_json_row(self, obj_dict):
-        return json.dumps(
-            self.__dict_lists_to_string(obj_dict), default=self.__converter
-        )
+        return json.dumps(self.__dict_lists_to_string(obj_dict), default=self.__converter)
 
     @staticmethod
     def __handle_api_response(json_response):
@@ -53,13 +51,9 @@ class TinybirdInterface:
         quarantined_rows = json_response["quarantined_rows"]
 
         if quarantined_rows > 0:
-            logger.error(
-                f"wrong insertion of {quarantined_rows} records to Tinybird API..."
-            )
+            logger.error(f"wrong insertion of {quarantined_rows} records to Tinybird API...")
         else:
-            logger.info(
-                f"successfully inserted {successful_rows} records to Tinybird API!"
-            )
+            logger.info(f"successfully inserted {successful_rows} records to Tinybird API!")
 
         return successful_rows, quarantined_rows
 
@@ -72,9 +66,7 @@ class TinybirdInterface:
         return self.__insert_data_to_endpoint(self.__prepare_json_row(obj_dict))
 
     def insert_batch_to_api(self, batch):
-        return self.__insert_data_to_endpoint(
-            "\n".join([self.__prepare_json_row(x) for x in batch])
-        )
+        return self.__insert_data_to_endpoint("\n".join([self.__prepare_json_row(x) for x in batch]))
 
     def insert_pandas_df_to_api(self, df):
         return self.__insert_data_to_endpoint(df.to_json(orient="records", lines=True))
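The collapsed one-liners preserve the NDJSON contract: `insert_batch_to_api` serializes each row and joins them with newlines, one JSON object per line, which is the shape newline-delimited ingestion endpoints expect. A toy illustration of that payload format (rows invented):

    import json

    rows = [{"id": 1}, {"id": 2}]
    payload = "\n".join(json.dumps(r) for r in rows)
    # payload == '{"id": 1}\n{"id": 2}'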
{datamarket-0.7.103 → datamarket-0.7.104}/src/datamarket/params/nominatim.py

@@ -1,6 +1,7 @@
-from unidecode import unidecode
 import re
 
+from unidecode import unidecode
+
 CITY_TO_PROVINCE = {"Madrid": "Madrid"}
 
 POSTCODES = {
@@ -396,31 +397,27 @@ _NORMALIZED_PROVINCE_CACHE = {}
 for country, provinces in PROVINCE_TO_POSTCODE.items():
     # Get the original keys (e.g., "A Coruña", "Álava")
     original_keys = list(provinces.keys())
-
+
     # Create the normalized list (e.g., "a coruna", "alava")
     normalized_choices = [unidecode(p).lower() for p in original_keys]
-
+
     _NORMALIZED_PROVINCE_CACHE[country] = {
-        "choices": normalized_choices,
-        "keys": original_keys
+        "choices": normalized_choices,  # The list for rapidfuzz to search in
+        "keys": original_keys,  # The list to find the name by index
     }
 
 # Source: https://github.com/ariankoochak/regex-patterns-of-all-countries
 COUNTRY_PARSING_RULES = {
     "es": {
         "zip_validate_pattern": re.compile(r"^\d{5}$"),
-
-        "
-
-        "phone_validate_pattern": re.compile(r"^(\+?34)?[67]\d{8}$")
+        "zip_search_pattern": re.compile(r"\b\d{5}\b"),
+        "phone_validate_pattern": re.compile(r"^(\+?34)?[67]\d{8}$"),
     },
     "pt": {
         "zip_validate_pattern": re.compile(r"^\d{4}[- ]{0,1}\d{3}$|^\d{4}$"),
-
         "zip_search_pattern": re.compile(r"\b\d{4}[- ]?\d{3}\b|\b\d{4}\b"),
-
-
-        }
+        "phone_validate_pattern": re.compile(r"^(\+?351)?9[1236]\d{7}$"),
+    },
 }
 
 # Cutoff score for rapidfuzz in the name standardization function
datamarket-0.7.104/src/datamarket/utils/__init__.py

@@ -0,0 +1 @@
+from .main import *  # noqa: F403
{datamarket-0.7.103 → datamarket-0.7.104}/src/datamarket/utils/airflow.py

@@ -3,20 +3,23 @@
 
 import re
 import unicodedata
+
 import inflection
 
 ########################################################################################################################
 # FUNCTIONS
 
+
 def process_task_name(task_id):
-    task_id =
-        f"_{unicodedata.name(c)}_" if not c.isalnum() else c
-
+    task_id = "".join(
+        f"_{unicodedata.name(c)}_" if not c.isalnum() else c
+        for c in task_id
+        if c.isalnum() or (unicodedata.category(c) not in ("Cc", "Cf", "Cs", "Co", "Cn"))
     )
-    task_id = inflection.parameterize(task_id, separator=
+    task_id = inflection.parameterize(task_id, separator="_")
     task_id = task_id.lower()
-    task_id = task_id.strip(
-    task_id = re.sub(r
+    task_id = task_id.strip("_")
+    task_id = re.sub(r"_+", "_", task_id)
     if task_id[0].isdigit():
-        task_id =
+        task_id = "task_" + task_id
     return task_id
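Worked through, the rebuilt pipeline turns an arbitrary display name into a safe task id: non-alphanumeric characters become their Unicode names wrapped in underscores, control characters are dropped, and the result is parameterized, lowercased, trimmed, deduplicated, and prefixed when it starts with a digit. An assumed input/output pair, derived by tracing the code rather than taken from the source:

    process_task_name("1ª tarea")
    # 'ª' -> '_FEMININE ORDINAL INDICATOR_', then parameterize/strip/dedupe
    # -> something like "task_1_feminine_ordinal_indicator_tarea"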
{datamarket-0.7.103 → datamarket-0.7.104}/src/datamarket/utils/alchemy.py

@@ -8,6 +8,7 @@ from sqlalchemy.ext.declarative import declarative_base
 
 Base = declarative_base()
 
+
 class View(Base):
     __abstract__ = True
     is_view = True
@@ -19,4 +20,4 @@ class View(Base):
         """
         conn.execute(f"""
             CREATE OR REPLACE VIEW {cls.__tablename__} AS {query}
-        """)
+        """)
{datamarket-0.7.103 → datamarket-0.7.104}/src/datamarket/utils/main.py

@@ -3,19 +3,19 @@
 
 import asyncio
 import configparser
-from datetime import timedelta
 import logging
 import random
 import re
 import shlex
 import subprocess
 import time
+from datetime import timedelta
 from typing import Sequence
-from babel.numbers import parse_decimal
 
-from bs4 import BeautifulSoup
 import pendulum
 import requests
+from babel.numbers import parse_decimal
+from bs4 import BeautifulSoup
 from requests.exceptions import ProxyError
 from tenacity import (
     before_sleep_log,
@@ -29,7 +29,7 @@ from tenacity import (
 
 from datamarket.exceptions.main import ManagedHTTPError
 
-from ..exceptions import
+from ..exceptions import BadRequestError, NotFoundError, RedirectionDetectedError
 from ..interfaces.proxy import ProxyInterface
 
 ########################################################################################################################
{datamarket-0.7.103 → datamarket-0.7.104}/src/datamarket/utils/nominatim.py

@@ -2,16 +2,18 @@
 # IMPORTS
 
 from typing import Literal, Optional
+
 from rapidfuzz import fuzz, process
 from unidecode import unidecode
+
 from ..params.nominatim import (
+    _NORMALIZED_PROVINCE_CACHE,
+    COUNTRY_PARSING_RULES,
     POSTCODE_TO_STATES,
     PROVINCE_TO_POSTCODE,
     PROVINCES,
     STANDARD_THRESHOLD,
     STATES,
-    _NORMALIZED_PROVINCE_CACHE,
-    COUNTRY_PARSING_RULES
 )
 from .strings import normalize
 
@@ -32,9 +34,7 @@ def standardize_admin_division(
         return None
 
     country_code = country_code.lower()
-    mapping = (
-        STATES.get(country_code) if level == "state" else PROVINCES.get(country_code)
-    )
+    mapping = STATES.get(country_code) if level == "state" else PROVINCES.get(country_code)
 
     if not mapping:  # If country is not standardized, return raw name
         return name
@@ -103,7 +103,7 @@ def _province_postcode_match(
         normalized_address,
         cache["choices"],  # <-- Uses pre-computed list
         scorer=fuzz.partial_ratio,
-        score_cutoff=100
+        score_cutoff=100,
     )
 
     if not result:
@@ -119,11 +119,8 @@ def _province_postcode_match(
     province_map = PROVINCE_TO_POSTCODE.get(country_code, {})
     postcode_prefix = province_map[original_province]
 
-    return (
-
-        if len(zip_code) == 4
-        else zip_code
-    )
+    return postcode_prefix + zip_code[1:] if len(zip_code) == 4 else zip_code
+
 
 def _parse_es_zip_code(
     zip_code: str,
@@ -131,16 +128,16 @@ def _parse_es_zip_code(
     opt_address: str | None,
 ) -> str:
     """parse spain zip code"""
-
+
     # Get the validation regex from params
-    validate_regex = COUNTRY_PARSING_RULES[
-
+    validate_regex = COUNTRY_PARSING_RULES["es"]["zip_validate_pattern"]
+
     if validate_regex.match(zip_code):
         return zip_code
     else:
         # Use search regex from params
-        pattern = COUNTRY_PARSING_RULES[
-
+        pattern = COUNTRY_PARSING_RULES["es"]["zip_search_pattern"]
+
         match = pattern.search(address)
         if match:
             return match.group()
@@ -148,7 +145,7 @@ def _parse_es_zip_code(
         match = pattern.search(opt_address)
         if match:
             return match.group()
-
+
     province_match = _province_postcode_match(address, zip_code, country_code="es")
     return province_match or zip_code
 
@@ -161,18 +158,18 @@ def _parse_pt_zip_code(
     """parse portugal zip code"""
 
     # Get the validation regex from params
-    validate_regex = COUNTRY_PARSING_RULES[
+    validate_regex = COUNTRY_PARSING_RULES["pt"]["zip_validate_pattern"]
 
     if validate_regex.match(zip_code):
         return zip_code
     else:
         # Use search regex from params
-        pattern = COUNTRY_PARSING_RULES[
-
+        pattern = COUNTRY_PARSING_RULES["pt"]["zip_search_pattern"]
+
         match = pattern.search(address)
         if match is None and opt_address:
             match = pattern.search(opt_address)
-
+
         return match.group() if match else zip_code
 
 
{datamarket-0.7.103 → datamarket-0.7.104}/src/datamarket/utils/playwright/async_api.py

@@ -13,8 +13,12 @@ from camoufox.async_api import AsyncCamoufox as Camoufox
 from playwright.async_api import (
     Browser,
     BrowserContext,
-    Error as PlaywrightError,
     Page,
+)
+from playwright.async_api import (
+    Error as PlaywrightError,
+)
+from playwright.async_api import (
     TimeoutError as PlaywrightTimeoutError,
 )
 from tenacity import (
@@ -27,7 +31,6 @@ from tenacity import (
 
 from datamarket.interfaces.proxy import ProxyInterface
 
-
 ########################################################################################################################
 # SETUP LOGGER
 
{datamarket-0.7.103 → datamarket-0.7.104}/src/datamarket/utils/playwright/sync_api.py

@@ -12,8 +12,12 @@ from camoufox import Camoufox
 from playwright.sync_api import (
     Browser,
     BrowserContext,
-    Error as PlaywrightError,
     Page,
+)
+from playwright.sync_api import (
+    Error as PlaywrightError,
+)
+from playwright.sync_api import (
     TimeoutError as PlaywrightTimeoutError,
 )
 from tenacity import (
@@ -23,6 +27,7 @@ from tenacity import (
     stop_after_delay,
     wait_exponential,
 )
+
 from datamarket.interfaces.proxy import ProxyInterface
 
 ########################################################################################################################
@@ -171,4 +176,4 @@ class PlaywrightCrawler:
         if not self.page:
             logger.info("Browser context not found, initializing now...")
             self.init_context()
-        return self._goto_with_retry(url)
+        return self._goto_with_retry(url)
{datamarket-0.7.103 → datamarket-0.7.104}/src/datamarket/utils/selenium.py

@@ -17,9 +17,7 @@ logger = logging.getLogger(__name__)
 
 
 def get_chromedriver_version():
-    return int(
-        run_bash_command("/usr/bin/google-chrome --version").split(" ")[2].split(".")[0]
-    )
+    return int(run_bash_command("/usr/bin/google-chrome --version").split(" ")[2].split(".")[0])
 
 
 def get_driver(chrome_options=None, **kwargs):
@@ -38,23 +36,19 @@ def get_driver(chrome_options=None, **kwargs):
 
 def wait(driver, css_selector, timeout=30):
     logger.info(f"waiting for {css_selector}...")
-    return WebDriverWait(driver, timeout).until(
-        EC.visibility_of_element_located(("css selector", css_selector))
-    )
+    return WebDriverWait(driver, timeout).until(EC.visibility_of_element_located(("css selector", css_selector)))
 
 
 def wait_and_click(driver, css_selector, timeout=30):
     logger.info(f"clicking on {css_selector}...")
-    WebDriverWait(driver, timeout).until(
-        EC.element_to_be_clickable(("css selector", css_selector))
-    ).click()
+    WebDriverWait(driver, timeout).until(EC.element_to_be_clickable(("css selector", css_selector))).click()
 
 
 def wait_and_fill(driver, css_selector, text_to_fill, timeout=30):
     logger.info(f"sending text to {css_selector}...")
-    WebDriverWait(driver, timeout).until(
-
-    )
+    WebDriverWait(driver, timeout).until(EC.presence_of_element_located(("css selector", css_selector))).send_keys(
+        text_to_fill
+    )
 
 
 def scroll(driver, css_selector):
{datamarket-0.7.103 → datamarket-0.7.104}/src/datamarket/utils/strings/normalization.py

@@ -1,9 +1,9 @@
 ########################################################################################################################
 # IMPORTS
-import re
 import unicodedata
 from enum import Enum, auto
 from typing import Any, Optional, Set, Union
+
 import numpy as np
 from inflection import camelize, parameterize, titleize, underscore
 from string_utils import prettify, strip_html
@@ -36,9 +36,7 @@ class NamingConvention(Enum):
 # FUNCTIONS
 
 
-def get_unidecoded_text(
-    input_text: str, allowed_chars: Set[str], apply_lowercase: bool = False
-) -> str:
+def get_unidecoded_text(input_text: str, allowed_chars: Set[str], apply_lowercase: bool = False) -> str:
     """
     Processes a string by unidecoding characters, optionally lowercasing them,
     while preserving a specified set of allowed characters.
@@ -65,9 +63,7 @@ def get_unidecoded_text(
     return "".join(chars_list)
 
 
-def transliterate_symbols(
-    s: str, allowed_symbols_set: Optional[Set[str]] = None
-) -> str:
+def transliterate_symbols(s: str, allowed_symbols_set: Optional[Set[str]] = None) -> str:
    """
     Translates Unicode symbols (category S*) in the input string to their lowercase Unicode names,
     with spaces replaced by underscores. Other characters, or characters in allowed_symbols_set, remain unchanged.
@@ -182,9 +178,7 @@ def normalize(
 
     for c in intermediate_text:
         cat = unicodedata.category(c)
-        if (
-            c in _allowed_symbols_set or c.isalnum()
-        ):  # Allowed symbols are part of tokens
+        if c in _allowed_symbols_set or c.isalnum():  # Allowed symbols are part of tokens
            current_token_chars.append(c)
         elif mode is NormalizationMode.FULL and cat.startswith("S"):
             # Transliterate S* category symbols not in allowed_symbols
@@ -220,4 +214,4 @@ def normalize(
     if naming is NamingConvention.PASCAL:
         return camelize(underscored)
 
-    return underscored
+    return underscored
{datamarket-0.7.103 → datamarket-0.7.104}/src/datamarket/utils/strings/standardization.py

@@ -3,11 +3,13 @@
 
 import re
 from typing import Literal
+
 from ...params.nominatim import COUNTRY_PARSING_RULES
 
 ########################################################################################################################
 # FUNCTIONS
 
+
 def parse_phone_number(number: str, country_code: Literal["es", "pt"]) -> str | None:
     """Clean and standardize phone number from a certain country_code
 
@@ -27,12 +29,12 @@ def parse_phone_number(number: str, country_code: Literal["es", "pt"]) -> str |
         pattern = COUNTRY_PARSING_RULES[country_code]["phone_validate_pattern"]
 
         # Validate and extract in one step
-        if len(clean_number) >= 9:
+        if len(clean_number) >= 9:  # Check if the cleaned number has at least 9 digits
             match = pattern.match(clean_number)
 
             # Return the captured group (the 9-digit number)
             return match.group(0)[-9:] if match else None
         else:
-            return None
+            return None  # Or handle the case where the number is too short
     else:
         raise ValueError(f"Country code ({country_code}) is not currently supported")
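Traced against the Spanish pattern `^(\+?34)?[67]\d{8}$` from params/nominatim.py, the function strips formatting, requires at least nine digits, and returns the trailing nine. Illustrative calls (numbers invented, and the first assumes the cleaning step preserves the digits):

    parse_phone_number("+34 612 345 678", "es")  # -> "612345678"
    parse_phone_number("12345", "es")            # -> None, fewer than 9 digits
    parse_phone_number("912345678", "fr")        # raises ValueError: unsupported country code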
{datamarket-0.7.103 → datamarket-0.7.104}/src/datamarket/utils/typer.py

@@ -9,6 +9,7 @@ from typing_extensions import Annotated
 ########################################################################################################################
 # TYPES
 
+
 class Dict(dict):
     def __init__(self, value: str):
         super().__init__(json.loads(value))
@@ -25,4 +26,4 @@ def parse_json_dict(value: str) -> Dict:
 
 
 DictArg = Annotated[Dict, typer.Argument(parser=parse_json_dict)]
-DictOpt = Annotated[Dict, typer.Option(parser=parse_json_dict)]
+DictOpt = Annotated[Dict, typer.Option(parser=parse_json_dict)]
datamarket-0.7.103/src/datamarket/exceptions/__init__.py

@@ -1 +0,0 @@
-from .main import *  # noqa: F403