datamarket 0.7.89__py3-none-any.whl → 0.7.125__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamarket/exceptions/__init__.py +1 -1
- datamarket/exceptions/main.py +108 -4
- datamarket/interfaces/alchemy.py +30 -5
- datamarket/interfaces/aws.py +25 -3
- datamarket/interfaces/azure.py +18 -18
- datamarket/interfaces/ftp.py +3 -11
- datamarket/interfaces/nominatim.py +39 -28
- datamarket/interfaces/proxy.py +320 -74
- datamarket/interfaces/tinybird.py +4 -12
- datamarket/params/nominatim.py +434 -19
- datamarket/utils/airflow.py +10 -7
- datamarket/utils/alchemy.py +2 -1
- datamarket/utils/main.py +115 -67
- datamarket/utils/nominatim.py +201 -0
- datamarket/utils/playwright/async_api.py +150 -28
- datamarket/utils/playwright/sync_api.py +159 -27
- datamarket/utils/requests.py +653 -0
- datamarket/utils/selenium.py +6 -12
- datamarket/utils/strings/normalization.py +0 -1
- datamarket/utils/strings/standardization.py +40 -0
- datamarket/utils/typer.py +2 -1
- {datamarket-0.7.89.dist-info → datamarket-0.7.125.dist-info}/METADATA +11 -10
- datamarket-0.7.125.dist-info/RECORD +36 -0
- {datamarket-0.7.89.dist-info → datamarket-0.7.125.dist-info}/WHEEL +1 -1
- datamarket-0.7.89.dist-info/RECORD +0 -33
- {datamarket-0.7.89.dist-info/licenses → datamarket-0.7.125.dist-info}/LICENSE +0 -0
datamarket/exceptions/__init__.py
CHANGED

@@ -1 +1 @@
- from .main import *  # noqa: F403
+ from .main import *  # noqa: F403
datamarket/exceptions/main.py
CHANGED

@@ -2,13 +2,117 @@
  # CLASSES


-
-
+ from typing import Optional
+
+ from requests import Request, Response
+ from requests.exceptions import HTTPError
+
+
+ class ManagedHTTPError(HTTPError):
+     """Signal that this HTTP status was handled and should not be retried."""
+
+     def __init__(
+         self,
+         message: Optional[str] = None,
+         response: Optional[Response] = None,
+         request: Optional[Request] = None,
+         *args,
+         **kwargs,
+     ):
+         self.response = response
+         self.request = request or getattr(response, "request", None)
+
+         # Build a safe default message
+         if not message:
+             status = getattr(self.response, "status_code", "unknown")
+             url = getattr(self.request, "url", "unknown")
+             message = f"HTTP {status} for {url}"
+
+         self.message = message
+
+         super().__init__(message, *args, response=response, **kwargs)
+
+
+ class IgnoredHTTPError(ManagedHTTPError):
+     """Exception type that signals the error should be ignored by retry logic."""
+
+     pass
+
+
+ class NotFoundError(ManagedHTTPError):
+     def __init__(
+         self,
+         message: Optional[str] = None,
+         response: Optional[Response] = None,
+         request: Optional[Request] = None,
+         *args,
+         **kwargs,
+     ):
+         if not message:
+             status = getattr(response, "status_code", 404)
+             req = request or getattr(response, "request", None)
+             url = getattr(req, "url", "unknown")
+             message = f"HTTP {status} for {url}"
+         super().__init__(message, response, request, *args, **kwargs)
+
+
+ class BadRequestError(ManagedHTTPError):
+     def __init__(
+         self,
+         message: Optional[str] = None,
+         response: Optional[Response] = None,
+         request: Optional[Request] = None,
+         *args,
+         **kwargs,
+     ):
+         if not message:
+             status = getattr(response, "status_code", 400)
+             req = request or getattr(response, "request", None)
+             url = getattr(req, "url", "unknown")
+             message = f"HTTP {status} for {url}"
+         super().__init__(message, response, request, *args, **kwargs)
+
+
+ class EmptyResponseError(ManagedHTTPError):
+     def __init__(
+         self,
+         message: Optional[str] = None,
+         response: Optional[Response] = None,
+         request: Optional[Request] = None,
+         *args,
+         **kwargs,
+     ):
+         if not message:
+             req = request or getattr(response, "request", None)
+             url = getattr(req, "url", "unknown")
+             message = f"Empty response for {url}"
+         super().__init__(message, response, request, *args, **kwargs)
+
+
+ class RedirectionDetectedError(ManagedHTTPError):
+     def __init__(
+         self,
+         message: Optional[str] = None,
+         response: Optional[Response] = None,
+         request: Optional[Request] = None,
+         *args,
+         **kwargs,
+     ):
+         if not message:
+             status = getattr(response, "status_code", 300)
+             req = request or getattr(response, "request", None)
+             url = getattr(req, "url", "unknown")
+             message = f"HTTP {status} for {url}"
+         super().__init__(message, response, request, *args, **kwargs)
+
+
+ class NoWorkingProxiesError(Exception):
+     def __init__(self, message="No working proxies available"):
          self.message = message
          super().__init__(self.message)


- class
-     def __init__(self, message="
+ class EnsureNewIPTimeoutError(Exception):
+     def __init__(self, message="Timed out waiting for new IP"):
          self.message = message
          super().__init__(self.message)
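
The practical effect of the new hierarchy is that retry helpers can let ManagedHTTPError subclasses pass through instead of retrying them. A minimal sketch of calling code under that assumption; raise_for_known_status is a hypothetical helper, not part of the package:

    from typing import Optional

    import requests

    from datamarket.exceptions import BadRequestError, ManagedHTTPError, NotFoundError

    def raise_for_known_status(response: requests.Response) -> None:
        # Hypothetical mapping of handled statuses to the new exception types.
        if response.status_code == 404:
            raise NotFoundError(response=response)
        if response.status_code == 400:
            raise BadRequestError(response=response)
        response.raise_for_status()  # anything else stays a plain HTTPError

    def fetch(url: str) -> Optional[requests.Response]:
        try:
            response = requests.get(url, timeout=10)
            raise_for_known_status(response)
            return response
        except ManagedHTTPError as ex:
            # Handled status: log the default "HTTP <status> for <url>" message
            # and move on; other HTTP errors propagate and can be retried upstream.
            print(ex.message)
            return None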
datamarket/interfaces/alchemy.py
CHANGED

@@ -3,6 +3,7 @@

  import logging
  from collections.abc import MutableMapping
+ from enum import Enum, auto
  from typing import Any, Iterator, List, Optional, Type, TypeVar, Union
  from urllib.parse import quote_plus

@@ -12,7 +13,6 @@ from sqlalchemy.exc import IntegrityError
  from sqlalchemy.ext.declarative import DeclarativeMeta
  from sqlalchemy.orm import Session, sessionmaker
  from sqlalchemy.sql.expression import ClauseElement
- from enum import Enum, auto

  ########################################################################################################################
  # CLASSES

@@ -195,6 +195,31 @@ class AlchemyInterface:

          query_results.update({column_name: default_value}, synchronize_session=False)

+     @staticmethod
+     def _log_integrity_error(ex: IntegrityError, alchemy_obj, action="insert"):
+         """
+         Compact, readable IntegrityError logger using SQLSTATE codes.
+         Consult https://www.postgresql.org/docs/current/errcodes-appendix.html for details.
+         """
+
+         PG_ERROR_LABELS = {
+             "23000": "Integrity constraint violation",
+             "23001": "Restrict violation",
+             "23502": "NOT NULL violation",
+             "23503": "Foreign key violation",
+             "23505": "Unique violation",
+             "23514": "Check constraint violation",
+             "23P01": "Exclusion constraint violation",
+         }
+         code = getattr(ex.orig, "pgcode", None)
+         label = PG_ERROR_LABELS.get(code, "Integrity error (unspecified)")
+
+         # Log one clean message with trace + the raw DB message separately
+         if code == "23505":  # A simple info log for unique violations
+             logger.info(f"{label} trying to {action} {alchemy_obj}")
+         else:
+             logger.error(f"{label} trying to {action} {alchemy_obj}\nPostgreSQL message: {ex.orig}")
+
      def insert_alchemy_obj(self, alchemy_obj: ModelType, silent: bool = False) -> bool:
          if self.session is None:
              raise RuntimeError("Session not active. Use 'with AlchemyInterface(...):' or call start()")

@@ -205,10 +230,10 @@ class AlchemyInterface:

              if not silent:
                  logger.info(f"adding {alchemy_obj}...")
              self.session.add(alchemy_obj)
-         except IntegrityError:
+         except IntegrityError as ex:
              # Rollback is handled automatically by begin_nested() context manager on error
              if not silent:
-
+                 self._log_integrity_error(ex, alchemy_obj, action="insert")
              # Do not re-raise, allow outer transaction/loop to continue
              return False

@@ -264,10 +289,10 @@ class AlchemyInterface:

              # Use a savepoint (nested transaction)
              with self.session.begin_nested():
                  self.session.execute(statement)
-         except IntegrityError:
+         except IntegrityError as ex:
              # Rollback is handled automatically by begin_nested() context manager on error
              if not silent:
-
+                 self._log_integrity_error(ex, alchemy_obj, action="upsert")
              # Do not re-raise, allow outer transaction/loop to continue
              return False
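
A rough sketch of what this logging buys in practice, assuming a PostgreSQL backend and a model with a unique constraint; config and the User model are illustrative, not part of the package:

    # Duplicate inserts now surface as a compact SQLSTATE-labelled log line
    # instead of a bare stack trace; the method still returns False so batch
    # loops can keep going.
    with AlchemyInterface(config) as db:
        db.insert_alchemy_obj(User(email="ana@example.com"))
        # pgcode 23505 -> INFO "Unique violation trying to insert <User ...>"
        ok = db.insert_alchemy_obj(User(email="ana@example.com"))
        assert ok is False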
datamarket/interfaces/aws.py
CHANGED

@@ -3,8 +3,9 @@

  import io
  import logging
+ from typing import Any, Dict, List, Optional
+
  import boto3
- from typing import Optional, List, Dict, Any

  ########################################################################################################################
  # CLASSES

@@ -82,6 +83,14 @@ class AWSInterface:

              return
          logger.warning(f"Profile {profile_name} not found")

+     def get_bucket_url(self) -> Optional[str]:
+         """Return active bucket URL."""
+         if not self.bucket:
+             logger.warning("No active bucket selected")
+             return None
+         region = self.s3_client.meta.region_name
+         return f"https://{self.bucket}.s3.{region}.amazonaws.com"
+
      def get_file(self, s3_path: str):
          if not self.bucket:
              logger.warning("No active bucket selected")

@@ -92,14 +101,27 @@ class AWSInterface:

          logger.info(f"{s3_path} does not exist")
          return None

+     def file_exists(self, s3_path: str) -> bool:
+         if not self.bucket:
+             logger.warning("No active bucket selected")
+             return False
+         try:
+             self.s3_client.head_object(Bucket=self.bucket, Key=s3_path)
+             return True
+         except self.s3_client.exceptions.NoSuchKey:
+             return False
+         except Exception as e:
+             logger.error(f"Error checking existence of {s3_path}: {e}")
+             raise
+
      def read_file_as_bytes(self, s3_path: str) -> Optional[io.BytesIO]:
          obj = self.get_file(s3_path)
          if not obj:
              return None
          return io.BytesIO(obj["Body"].read())

-     def upload_file(self, local_path: str, s3_path: str) -> None:
+     def upload_file(self, local_path: str, s3_path: str, **kwargs) -> None:
          if not self.bucket:
              logger.warning("No active bucket selected")
              return
-         self.s3.Bucket(self.bucket).upload_file(local_path, s3_path)
+         self.s3.Bucket(self.bucket).upload_file(local_path, s3_path, **kwargs)
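
A minimal sketch of the new surface, assuming an AWSInterface already configured with an active bucket (the construction is illustrative). ExtraArgs is boto3's own upload_file parameter, now reachable through **kwargs:

    aws = AWSInterface(config)  # illustrative construction

    # **kwargs is forwarded to boto3's Bucket.upload_file, so metadata such as
    # the content type can be set via ExtraArgs.
    aws.upload_file("report.pdf", "reports/2024/report.pdf", ExtraArgs={"ContentType": "application/pdf"})

    if aws.file_exists("reports/2024/report.pdf"):
        print(f"{aws.get_bucket_url()}/reports/2024/report.pdf")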
datamarket/interfaces/azure.py
CHANGED

@@ -5,7 +5,7 @@ import logging

  from pathlib import Path
  from typing import Any, Dict, List, Optional

- from azure.storage.blob import BlobServiceClient
+ from azure.storage.blob import BlobServiceClient, ContainerClient
  from pendulum import now

  ########################################################################################################################

@@ -24,22 +24,26 @@ class AzureBlobInterface:

                  profile_name = section.split(":", 1)[1]
                  connection_string = self.config[section].get("connection_string")
                  container_name = self.config[section].get("container_name")
+                 sas_container_url = self.config[section].get("sas_container_url")
+
+                 if sas_container_url:
+                     session = ContainerClient.from_container_url(sas_container_url)
+                 elif connection_string and container_name:
+                     session = BlobServiceClient.from_connection_string(connection_string).get_container_client(
+                         container_name
+                     )

                  self.profiles.append(
                      {
                          "profile": profile_name,
                          "container_name": container_name,
-                         "session": BlobServiceClient.from_connection_string(
-                             connection_string
-                         ).get_container_client(container_name),
+                         "session": session,
                      }
                  )

          if not self.profiles:
              logger.warning("No Azure profiles found in config file")
-         self.current_profile: Optional[Dict[str, Any]] = (
-             self.profiles[0] if self.profiles else None
-         )
+         self.current_profile: Optional[Dict[str, Any]] = self.profiles[0] if self.profiles else None

@@ -54,7 +58,7 @@ class AzureBlobInterface:

          remote_folder,
          remote_file=None,
          upload_file_info=False,
-         **
+         **file_info_data,
      ):
          if not remote_file:
              remote_file = Path(local_file).name

@@ -66,16 +70,16 @@ class AzureBlobInterface:

          blob_client.upload_blob(data, overwrite=True)

          if upload_file_info:
-             self.upload_file_info(remote_path, **
+             self.upload_file_info(remote_path, **file_info_data)

-     def upload_file_info(self, remote_path, **
+     def upload_file_info(self, remote_path, **file_info_data):
          summary_file = remote_path.split(".")[0] + "_resumen.csv"
          blob_client = self.current_profile["session"].get_blob_client(summary_file)

          new_record = {
              "file": remote_path,
-             "num_rows":
-             "schema_version":
+             "num_rows": file_info_data.get("num_rows"),
+             "schema_version": file_info_data.get("schema_version"),
              "upload_date": now(tz="Europe/Madrid").to_datetime_string(),
          }

@@ -109,14 +113,10 @@ class AzureBlobInterface:

          if blob_client.exists():
              properties = blob_client.get_blob_properties()
              if properties.size > 100:  # Check if size is greater than 100 bytes
-                 logger.debug(
-                     f"Blob '{remote_path}' exists and is not empty (size: {properties.size})."
-                 )
+                 logger.debug(f"Blob '{remote_path}' exists and is not empty (size: {properties.size}).")
                  return True
              else:
-                 logger.debug(
-                     f"Blob '{remote_path}' exists but size ({properties.size}) is not > 100 bytes."
-                 )
+                 logger.debug(f"Blob '{remote_path}' exists but size ({properties.size}) is not > 100 bytes.")
                  return False
          else:
              logger.debug(f"Blob '{remote_path}' does not exist.")
datamarket/interfaces/ftp.py
CHANGED

@@ -20,22 +20,14 @@ class FTPInterface:

              if section.startswith("ftp:"):
                  profile_name = section.split(":", 1)[1]
                  ftps = self.config[section]["ftps"].lower() == "true"
-                 ftp_conn = (
-                     FTP_TLS(self.config[section]["server"])
-                     if ftps
-                     else FTP(self.config[section]["server"])
-                 )  # noqa: S321
-                 ftp_conn.login(
-                     self.config[section]["username"], self.config[section]["password"]
-                 )
+                 ftp_conn = FTP_TLS(self.config[section]["server"]) if ftps else FTP(self.config[section]["server"])  # noqa: S321
+                 ftp_conn.login(self.config[section]["username"], self.config[section]["password"])
                  self.profiles.append({"profile": profile_name, "session": ftp_conn})

          if not self.profiles:
              logger.warning("no ftp section in config")

-         self.current_profile: Optional[Dict[str, Any]] = (
-             self.profiles[0] if self.profiles else None
-         )
+         self.current_profile: Optional[Dict[str, Any]] = self.profiles[0] if self.profiles else None
          self.ftp = self.current_profile["session"] if self.current_profile else None

      def switch_profile(self, profile_name: str) -> None:

datamarket/interfaces/nominatim.py
CHANGED

@@ -10,7 +10,14 @@ import requests

  from geopy.distance import geodesic
  from jellyfish import jaro_winkler_similarity

- from ..params.nominatim import
+ from ..params.nominatim import (
+     CITY_TO_PROVINCE,
+     MADRID_DISTRICT_DIRECT_PATCH,
+     MADRID_DISTRICT_QUARTER_PATCH,
+     MADRID_QUARTER_DIRECT_PATCH,
+     POSTCODES,
+ )
+ from ..utils.nominatim import standardize_admin_division
  from ..utils.strings import normalize

  ########################################################################################################################

@@ -141,24 +148,6 @@ class Nominatim:

              "number": None,
          }

-     @staticmethod
-     def _canonicalize_state(state: Optional[str]) -> Optional[str]:
-         """
-         Canonicalize the state name using similarity. The most similar canonical state name is
-         returned if the similarity score is above the threshold.
-         """
-         if not state:
-             return None
-         norm_state = normalize(state)
-         best_match = None
-         best_score = 0.0
-         for canonical in STATES:
-             score = jaro_winkler_similarity(norm_state, normalize(canonical))
-             if score > best_score:
-                 best_score = score
-                 best_match = canonical
-         return best_match if best_score > JARO_WINKLER_THRESHOLD else None
-
      def _select_postcode_and_derived_province(
          self,
          parsed_nominatim_result: Dict[str, Optional[str]],

@@ -214,7 +203,7 @@ class Nominatim:

          )

          # 2) If no raw province, compare with province from Nominatim PC **only when** Nominatim is close
-         if not geonames_pc_valid and not norm_raw_province and nominatim_is_close:
+         if not geonames_pc_valid and not norm_raw_province and nominatim_is_close:  # noqa: SIM102
              if norm_province_from_geonames_pc and norm_province_from_nominatim_pc:
                  geonames_pc_valid = (
                      jaro_winkler_similarity(norm_province_from_geonames_pc, norm_province_from_nominatim_pc)

@@ -243,9 +232,6 @@ class Nominatim:

          if not state and nominatim_pc_valid:
              state = parsed_nominatim_result.get("state")

-         # Canonicalize
-         state = self._canonicalize_state(state)
-
          return postcode, province, state

      def _select_final_result(

@@ -293,14 +279,32 @@ class Nominatim:

          return final_result

+     @staticmethod
+     def _patch_district(raw_district: str, raw_quarter: str = None):
+         """
+         Patches the district name, optionally using the quarter for specific patches.
+         """
+         if raw_quarter:
+             # If raw_quarter is provided, use the tuple (district, quarter) as the key.
+             key = (raw_district, raw_quarter)
+             return MADRID_DISTRICT_QUARTER_PATCH.get(key, raw_district)
+         else:
+             return MADRID_DISTRICT_DIRECT_PATCH.get(raw_district, raw_district)
+
+     @staticmethod
+     def _patch_quarter(raw_quarter: str):
+         """
+         Patches the quarter name directly.
+         """
+         return MADRID_QUARTER_DIRECT_PATCH.get(raw_quarter, raw_quarter)
+
      def _get_district_quarter(self, raw_json: Dict[str, Any]) -> Tuple[Optional[str], Optional[str]]:
          district = self._get_attribute(raw_json, ["city_district", "suburb", "borough"])
          quarter = self._get_attribute(raw_json, ["quarter", "neighbourhood"])
-
-
-
-
-
+         if (city := raw_json.get("city")) and city == "Madrid":
+             mid_district = self._patch_district(district)
+             quarter = self._patch_quarter(quarter)
+             district = self._patch_district(mid_district, quarter)
          return district, quarter

      def geocode(self, address: str) -> List[Dict[str, Any]]:

@@ -356,6 +360,13 @@ class Nominatim:

              selected_state,
          )

+         # Standardize
+         final_result["province"] = standardize_admin_division(
+             name=final_result["province"], level="province", country_code=final_result["country_code"]
+         )
+         final_result["state"] = standardize_admin_division(
+             name=final_result["state"], level="state", country_code=final_result["country_code"]
+         )
          return final_result
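
The Madrid patching runs in a fixed order: direct district patch, then direct quarter patch, then a (district, quarter) keyed patch that can override the district again. A minimal sketch with invented table entries; the real mappings live in datamarket/params/nominatim.py:

    # Invented entries, for illustration only.
    MADRID_DISTRICT_DIRECT_PATCH = {"Fuencarral": "Fuencarral-El Pardo"}
    MADRID_QUARTER_DIRECT_PATCH = {"Barrio de El Pardo": "El Pardo"}
    MADRID_DISTRICT_QUARTER_PATCH = {("Fuencarral-El Pardo", "El Pardo"): "Fuencarral-El Pardo"}

    district, quarter = "Fuencarral", "Barrio de El Pardo"
    district = MADRID_DISTRICT_DIRECT_PATCH.get(district, district)              # 1) direct district patch
    quarter = MADRID_QUARTER_DIRECT_PATCH.get(quarter, quarter)                  # 2) direct quarter patch
    district = MADRID_DISTRICT_QUARTER_PATCH.get((district, quarter), district)  # 3) pair-keyed override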