datamarket 0.7.89__py3-none-any.whl → 0.7.125__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1 +1 @@
1
- from .main import * # noqa: F403
1
+ from .main import * # noqa: F403
@@ -2,13 +2,117 @@
2
2
  # CLASSES
3
3
 
4
4
 
5
- class RedirectionDetectedError(Exception):
6
- def __init__(self, message="Redirection detected!"):
5
+ from typing import Optional
6
+
7
+ from requests import Request, Response
8
+ from requests.exceptions import HTTPError
9
+
10
+
11
class ManagedHTTPError(HTTPError):
    """Signal that this HTTP status was handled and should not be retried.

    Args:
        message: Human-readable description. When empty, a default of the form
            ``"HTTP <status> for <url>"`` is built from ``response``/``request``.
        response: The response that triggered the error, if any.
        request: The originating request. When omitted, ``response.request`` is
            used if the response exposes one.
        *args: Extra positional arguments forwarded to ``HTTPError``.
        **kwargs: Extra keyword arguments forwarded to ``HTTPError``.

    Attributes set on the instance: ``message``, ``response`` and ``request``.
    """

    def __init__(
        self,
        message: Optional[str] = None,
        response: Optional[Response] = None,
        request: Optional[Request] = None,
        *args,
        **kwargs,
    ):
        # An explicitly supplied request wins; otherwise borrow the one attached to the response.
        resolved_request = request or getattr(response, "request", None)

        # Build a safe default message
        if not message:
            status = getattr(response, "status_code", "unknown")
            url = getattr(resolved_request, "url", "unknown")
            message = f"HTTP {status} for {url}"

        # The requests base initializer assigns self.request from its ``request`` keyword;
        # forwarding the resolved request keeps it from being reset to None.
        super().__init__(message, *args, response=response, request=resolved_request, **kwargs)

        # Assign after the base call so these values are final regardless of what the base did.
        self.response = response
        self.request = resolved_request
        self.message = message
34
+
35
+
36
class IgnoredHTTPError(ManagedHTTPError):
    """Exception type that signals the error should be ignored by retry logic."""
40
+
41
+
42
class NotFoundError(ManagedHTTPError):
    """Managed HTTP error whose default message falls back to status 404.

    Accepts the same arguments as ``ManagedHTTPError``. When ``message`` is
    empty, ``"HTTP <status> for <url>"`` is built, using 404 if the response
    carries no ``status_code`` and ``"unknown"`` if no URL can be found.
    """

    def __init__(
        self,
        message: Optional[str] = None,
        response: Optional[Response] = None,
        request: Optional[Request] = None,
        *args,
        **kwargs,
    ):
        if message:
            # Caller supplied a message: hand everything to the parent untouched.
            super().__init__(message, response, request, *args, **kwargs)
            return

        source = request if request else getattr(response, "request", None)
        status = getattr(response, "status_code", 404)
        url = getattr(source, "url", "unknown")
        super().__init__(f"HTTP {status} for {url}", response, request, *args, **kwargs)
57
+
58
+
59
class BadRequestError(ManagedHTTPError):
    """Managed HTTP error whose default message falls back to status 400.

    Accepts the same arguments as ``ManagedHTTPError``. When ``message`` is
    empty, ``"HTTP <status> for <url>"`` is built, using 400 if the response
    carries no ``status_code`` and ``"unknown"`` if no URL can be found.
    """

    def __init__(
        self,
        message: Optional[str] = None,
        response: Optional[Response] = None,
        request: Optional[Request] = None,
        *args,
        **kwargs,
    ):
        if message:
            # Caller supplied a message: hand everything to the parent untouched.
            super().__init__(message, response, request, *args, **kwargs)
            return

        source = request if request else getattr(response, "request", None)
        status = getattr(response, "status_code", 400)
        url = getattr(source, "url", "unknown")
        super().__init__(f"HTTP {status} for {url}", response, request, *args, **kwargs)
74
+
75
+
76
class EmptyResponseError(ManagedHTTPError):
    """Managed HTTP error describing an empty response.

    Accepts the same arguments as ``ManagedHTTPError``. When ``message`` is
    empty, ``"Empty response for <url>"`` is built, with ``"unknown"`` as the
    URL if neither the request nor the response provides one.
    """

    def __init__(
        self,
        message: Optional[str] = None,
        response: Optional[Response] = None,
        request: Optional[Request] = None,
        *args,
        **kwargs,
    ):
        if message:
            # Caller supplied a message: hand everything to the parent untouched.
            super().__init__(message, response, request, *args, **kwargs)
            return

        source = request if request else getattr(response, "request", None)
        url = getattr(source, "url", "unknown")
        super().__init__(f"Empty response for {url}", response, request, *args, **kwargs)
90
+
91
+
92
class RedirectionDetectedError(ManagedHTTPError):
    """Managed HTTP error whose default message falls back to status 300.

    Accepts the same arguments as ``ManagedHTTPError``. When ``message`` is
    empty, ``"HTTP <status> for <url>"`` is built, using 300 if the response
    carries no ``status_code`` and ``"unknown"`` if no URL can be found.
    """

    def __init__(
        self,
        message: Optional[str] = None,
        response: Optional[Response] = None,
        request: Optional[Request] = None,
        *args,
        **kwargs,
    ):
        if message:
            # Caller supplied a message: hand everything to the parent untouched.
            super().__init__(message, response, request, *args, **kwargs)
            return

        source = request if request else getattr(response, "request", None)
        status = getattr(response, "status_code", 300)
        url = getattr(source, "url", "unknown")
        super().__init__(f"HTTP {status} for {url}", response, request, *args, **kwargs)
107
+
108
+
109
class NoWorkingProxiesError(Exception):
    """Exception carrying a ``message`` attribute.

    Args:
        message: Text stored on the instance and used as the exception
            argument; defaults to "No working proxies available".
    """

    def __init__(self, message="No working proxies available"):
        super().__init__(message)
        # Kept as an attribute so callers can read the text without str().
        self.message = message
9
113
 
10
114
 
11
- class NotFoundError(Exception):
12
- def __init__(self, message="Not found!"):
115
class EnsureNewIPTimeoutError(Exception):
    """Exception carrying a ``message`` attribute.

    Args:
        message: Text stored on the instance and used as the exception
            argument; defaults to "Timed out waiting for new IP".
    """

    def __init__(self, message="Timed out waiting for new IP"):
        super().__init__(message)
        # Kept as an attribute so callers can read the text without str().
        self.message = message
@@ -3,6 +3,7 @@
3
3
 
4
4
  import logging
5
5
  from collections.abc import MutableMapping
6
+ from enum import Enum, auto
6
7
  from typing import Any, Iterator, List, Optional, Type, TypeVar, Union
7
8
  from urllib.parse import quote_plus
8
9
 
@@ -12,7 +13,6 @@ from sqlalchemy.exc import IntegrityError
12
13
  from sqlalchemy.ext.declarative import DeclarativeMeta
13
14
  from sqlalchemy.orm import Session, sessionmaker
14
15
  from sqlalchemy.sql.expression import ClauseElement
15
- from enum import Enum, auto
16
16
 
17
17
  ########################################################################################################################
18
18
  # CLASSES
@@ -195,6 +195,31 @@ class AlchemyInterface:
195
195
 
196
196
  query_results.update({column_name: default_value}, synchronize_session=False)
197
197
 
198
@staticmethod
def _log_integrity_error(ex: IntegrityError, alchemy_obj, action="insert"):
    """
    Log an IntegrityError as a single readable line keyed on its SQLSTATE code.

    The code is read from ``ex.orig.pgcode`` (None when absent). Unique
    violations (23505) are logged at info level; any other code is logged at
    error level together with the raw ``ex.orig`` message.
    Consult https://www.postgresql.org/docs/current/errcodes-appendix.html for details.

    :param ex: the IntegrityError that was caught
    :param alchemy_obj: the object being written, interpolated into the log line
    :param action: verb describing the attempted operation (e.g. "insert", "upsert")
    """
    sqlstate_labels = {
        "23000": "Integrity constraint violation",
        "23001": "Restrict violation",
        "23502": "NOT NULL violation",
        "23503": "Foreign key violation",
        "23505": "Unique violation",
        "23514": "Check constraint violation",
        "23P01": "Exclusion constraint violation",
    }
    code = getattr(ex.orig, "pgcode", None)
    label = sqlstate_labels.get(code, "Integrity error (unspecified)")

    if code == "23505":
        # Unique violations only warrant an info-level line.
        logger.info(f"{label} trying to {action} {alchemy_obj}")
        return

    logger.error(f"{label} trying to {action} {alchemy_obj}\nPostgreSQL message: {ex.orig}")
222
+
198
223
  def insert_alchemy_obj(self, alchemy_obj: ModelType, silent: bool = False) -> bool:
199
224
  if self.session is None:
200
225
  raise RuntimeError("Session not active. Use 'with AlchemyInterface(...):' or call start()")
@@ -205,10 +230,10 @@ class AlchemyInterface:
205
230
  if not silent:
206
231
  logger.info(f"adding {alchemy_obj}...")
207
232
  self.session.add(alchemy_obj)
208
- except IntegrityError:
233
+ except IntegrityError as ex:
209
234
  # Rollback is handled automatically by begin_nested() context manager on error
210
235
  if not silent:
211
- logger.info(f"{alchemy_obj} already in db (savepoint rolled back)")
236
+ self._log_integrity_error(ex, alchemy_obj, action="insert")
212
237
  # Do not re-raise, allow outer transaction/loop to continue
213
238
  return False
214
239
 
@@ -264,10 +289,10 @@ class AlchemyInterface:
264
289
  # Use a savepoint (nested transaction)
265
290
  with self.session.begin_nested():
266
291
  self.session.execute(statement)
267
- except IntegrityError:
292
+ except IntegrityError as ex:
268
293
  # Rollback is handled automatically by begin_nested() context manager on error
269
294
  if not silent:
270
- logger.info(f"could not upsert {alchemy_obj} (savepoint rolled back)")
295
+ self._log_integrity_error(ex, alchemy_obj, action="upsert")
271
296
  # Do not re-raise, allow outer transaction/loop to continue
272
297
  return False
273
298
 
@@ -3,8 +3,9 @@
3
3
 
4
4
  import io
5
5
  import logging
6
+ from typing import Any, Dict, List, Optional
7
+
6
8
  import boto3
7
- from typing import Optional, List, Dict, Any
8
9
 
9
10
  ########################################################################################################################
10
11
  # CLASSES
@@ -82,6 +83,14 @@ class AWSInterface:
82
83
  return
83
84
  logger.warning(f"Profile {profile_name} not found")
84
85
 
86
def get_bucket_url(self) -> Optional[str]:
    """Return the URL of the active bucket, or None when no bucket is selected.

    The URL is formatted as ``https://<bucket>.s3.<region>.amazonaws.com``,
    with the region read from ``self.s3_client.meta.region_name``.
    """
    if self.bucket:
        region = self.s3_client.meta.region_name
        return f"https://{self.bucket}.s3.{region}.amazonaws.com"

    logger.warning("No active bucket selected")
    return None
93
+
85
94
  def get_file(self, s3_path: str):
86
95
  if not self.bucket:
87
96
  logger.warning("No active bucket selected")
@@ -92,14 +101,27 @@ class AWSInterface:
92
101
  logger.info(f"{s3_path} does not exist")
93
102
  return None
94
103
 
104
def file_exists(self, s3_path: str) -> bool:
    """Return True when ``s3_path`` exists in the active bucket.

    Existence is probed with ``head_object``. A missing key is reported by
    that call as a generic ``ClientError`` carrying code "404" rather than as
    ``NoSuchKey``, so the error code is inspected to tell "not found" apart
    from real failures.

    Args:
        s3_path: Object key inside the active bucket.

    Returns:
        True if the object exists; False if it does not or if no bucket is
        selected.

    Raises:
        Exception: Any failure other than "not found" is logged and re-raised.
    """
    if not self.bucket:
        logger.warning("No active bucket selected")
        return False

    # Error codes that mean "the key is absent"; "404" is what head_object yields.
    not_found_codes = ("404", "NoSuchKey", "NotFound")

    try:
        self.s3_client.head_object(Bucket=self.bucket, Key=s3_path)
    except self.s3_client.exceptions.ClientError as e:
        error_code = str(getattr(e, "response", {}).get("Error", {}).get("Code", ""))
        if error_code in not_found_codes:
            return False
        logger.error(f"Error checking existence of {s3_path}: {e}")
        raise
    except Exception as e:
        logger.error(f"Error checking existence of {s3_path}: {e}")
        raise
    return True
116
+
95
117
  def read_file_as_bytes(self, s3_path: str) -> Optional[io.BytesIO]:
96
118
  obj = self.get_file(s3_path)
97
119
  if not obj:
98
120
  return None
99
121
  return io.BytesIO(obj["Body"].read())
100
122
 
101
def upload_file(self, local_path: str, s3_path: str, **kwargs) -> None:
    """Upload a local file to ``s3_path`` in the active bucket.

    Extra keyword arguments are passed through unchanged to the bucket's
    ``upload_file`` call. When no bucket is selected a warning is logged and
    nothing is uploaded.
    """
    if self.bucket:
        target_bucket = self.s3.Bucket(self.bucket)
        target_bucket.upload_file(local_path, s3_path, **kwargs)
        return

    logger.warning("No active bucket selected")
@@ -5,7 +5,7 @@ import logging
5
5
  from pathlib import Path
6
6
  from typing import Any, Dict, List, Optional
7
7
 
8
- from azure.storage.blob import BlobServiceClient
8
+ from azure.storage.blob import BlobServiceClient, ContainerClient
9
9
  from pendulum import now
10
10
 
11
11
  ########################################################################################################################
@@ -24,22 +24,26 @@ class AzureBlobInterface:
24
24
  profile_name = section.split(":", 1)[1]
25
25
  connection_string = self.config[section].get("connection_string")
26
26
  container_name = self.config[section].get("container_name")
27
+ sas_container_url = self.config[section].get("sas_container_url")
28
+
29
+ if sas_container_url:
30
+ session = ContainerClient.from_container_url(sas_container_url)
31
+ elif connection_string and container_name:
32
+ session = BlobServiceClient.from_connection_string(connection_string).get_container_client(
33
+ container_name
34
+ )
27
35
 
28
36
  self.profiles.append(
29
37
  {
30
38
  "profile": profile_name,
31
39
  "container_name": container_name,
32
- "session": BlobServiceClient.from_connection_string(
33
- connection_string
34
- ).get_container_client(container_name),
40
+ "session": session,
35
41
  }
36
42
  )
37
43
 
38
44
  if not self.profiles:
39
45
  logger.warning("No Azure profiles found in config file")
40
- self.current_profile: Optional[Dict[str, Any]] = (
41
- self.profiles[0] if self.profiles else None
42
- )
46
+ self.current_profile: Optional[Dict[str, Any]] = self.profiles[0] if self.profiles else None
43
47
 
44
48
  def switch_profile(self, profile_name: str) -> None:
45
49
  for profile in self.profiles:
@@ -54,7 +58,7 @@ class AzureBlobInterface:
54
58
  remote_folder,
55
59
  remote_file=None,
56
60
  upload_file_info=False,
57
- **kwargs,
61
+ **file_info_data,
58
62
  ):
59
63
  if not remote_file:
60
64
  remote_file = Path(local_file).name
@@ -66,16 +70,16 @@ class AzureBlobInterface:
66
70
  blob_client.upload_blob(data, overwrite=True)
67
71
 
68
72
  if upload_file_info:
69
- self.upload_file_info(remote_path, **kwargs)
73
+ self.upload_file_info(remote_path, **file_info_data)
70
74
 
71
- def upload_file_info(self, remote_path, **kwargs):
75
+ def upload_file_info(self, remote_path, **file_info_data):
72
76
  summary_file = remote_path.split(".")[0] + "_resumen.csv"
73
77
  blob_client = self.current_profile["session"].get_blob_client(summary_file)
74
78
 
75
79
  new_record = {
76
80
  "file": remote_path,
77
- "num_rows": kwargs["num_rows"],
78
- "schema_version": kwargs["schema_version"],
81
+ "num_rows": file_info_data.get("num_rows"),
82
+ "schema_version": file_info_data.get("schema_version"),
79
83
  "upload_date": now(tz="Europe/Madrid").to_datetime_string(),
80
84
  }
81
85
 
@@ -109,14 +113,10 @@ class AzureBlobInterface:
109
113
  if blob_client.exists():
110
114
  properties = blob_client.get_blob_properties()
111
115
  if properties.size > 100: # Check if size is greater than 100 bytes
112
- logger.debug(
113
- f"Blob '{remote_path}' exists and is not empty (size: {properties.size})."
114
- )
116
+ logger.debug(f"Blob '{remote_path}' exists and is not empty (size: {properties.size}).")
115
117
  return True
116
118
  else:
117
- logger.debug(
118
- f"Blob '{remote_path}' exists but size ({properties.size}) is not > 100 bytes."
119
- )
119
+ logger.debug(f"Blob '{remote_path}' exists but size ({properties.size}) is not > 100 bytes.")
120
120
  return False
121
121
  else:
122
122
  logger.debug(f"Blob '{remote_path}' does not exist.")
@@ -20,22 +20,14 @@ class FTPInterface:
20
20
  if section.startswith("ftp:"):
21
21
  profile_name = section.split(":", 1)[1]
22
22
  ftps = self.config[section]["ftps"].lower() == "true"
23
- ftp_conn = (
24
- FTP_TLS(self.config[section]["server"])
25
- if ftps
26
- else FTP(self.config[section]["server"])
27
- ) # noqa: S321
28
- ftp_conn.login(
29
- self.config[section]["username"], self.config[section]["password"]
30
- )
23
+ ftp_conn = FTP_TLS(self.config[section]["server"]) if ftps else FTP(self.config[section]["server"]) # noqa: S321
24
+ ftp_conn.login(self.config[section]["username"], self.config[section]["password"])
31
25
  self.profiles.append({"profile": profile_name, "session": ftp_conn})
32
26
 
33
27
  if not self.profiles:
34
28
  logger.warning("no ftp section in config")
35
29
 
36
- self.current_profile: Optional[Dict[str, Any]] = (
37
- self.profiles[0] if self.profiles else None
38
- )
30
+ self.current_profile: Optional[Dict[str, Any]] = self.profiles[0] if self.profiles else None
39
31
  self.ftp = self.current_profile["session"] if self.current_profile else None
40
32
 
41
33
  def switch_profile(self, profile_name: str) -> None:
@@ -10,7 +10,14 @@ import requests
10
10
  from geopy.distance import geodesic
11
11
  from jellyfish import jaro_winkler_similarity
12
12
 
13
- from ..params.nominatim import CITY_TO_PROVINCE, POSTCODES, STATES
13
+ from ..params.nominatim import (
14
+ CITY_TO_PROVINCE,
15
+ MADRID_DISTRICT_DIRECT_PATCH,
16
+ MADRID_DISTRICT_QUARTER_PATCH,
17
+ MADRID_QUARTER_DIRECT_PATCH,
18
+ POSTCODES,
19
+ )
20
+ from ..utils.nominatim import standardize_admin_division
14
21
  from ..utils.strings import normalize
15
22
 
16
23
  ########################################################################################################################
@@ -141,24 +148,6 @@ class Nominatim:
141
148
  "number": None,
142
149
  }
143
150
 
144
- @staticmethod
145
- def _canonicalize_state(state: Optional[str]) -> Optional[str]:
146
- """
147
- Canonicalize the state name using similarity. The most similar canonical state name is
148
- returned if the similarity score is above the threshold.
149
- """
150
- if not state:
151
- return None
152
- norm_state = normalize(state)
153
- best_match = None
154
- best_score = 0.0
155
- for canonical in STATES:
156
- score = jaro_winkler_similarity(norm_state, normalize(canonical))
157
- if score > best_score:
158
- best_score = score
159
- best_match = canonical
160
- return best_match if best_score > JARO_WINKLER_THRESHOLD else None
161
-
162
151
  def _select_postcode_and_derived_province(
163
152
  self,
164
153
  parsed_nominatim_result: Dict[str, Optional[str]],
@@ -214,7 +203,7 @@ class Nominatim:
214
203
  )
215
204
 
216
205
  # 2) If no raw province, compare with province from Nominatim PC **only when** Nominatim is close
217
- if not geonames_pc_valid and not norm_raw_province and nominatim_is_close:
206
+ if not geonames_pc_valid and not norm_raw_province and nominatim_is_close: # noqa: SIM102
218
207
  if norm_province_from_geonames_pc and norm_province_from_nominatim_pc:
219
208
  geonames_pc_valid = (
220
209
  jaro_winkler_similarity(norm_province_from_geonames_pc, norm_province_from_nominatim_pc)
@@ -243,9 +232,6 @@ class Nominatim:
243
232
  if not state and nominatim_pc_valid:
244
233
  state = parsed_nominatim_result.get("state")
245
234
 
246
- # Canonicalize
247
- state = self._canonicalize_state(state)
248
-
249
235
  return postcode, province, state
250
236
 
251
237
  def _select_final_result(
@@ -293,14 +279,32 @@ class Nominatim:
293
279
 
294
280
  return final_result
295
281
 
282
@staticmethod
def _patch_district(raw_district: str, raw_quarter: str = None):
    """
    Return the patched district name, falling back to ``raw_district`` itself.

    Without a quarter, the district is looked up in MADRID_DISTRICT_DIRECT_PATCH.
    With a quarter, the ``(district, quarter)`` tuple is looked up in
    MADRID_DISTRICT_QUARTER_PATCH instead.
    """
    if not raw_quarter:
        return MADRID_DISTRICT_DIRECT_PATCH.get(raw_district, raw_district)

    # A quarter is available: the lookup key is the (district, quarter) pair.
    return MADRID_DISTRICT_QUARTER_PATCH.get((raw_district, raw_quarter), raw_district)
293
+
294
@staticmethod
def _patch_quarter(raw_quarter: str):
    """
    Return the entry for ``raw_quarter`` in MADRID_QUARTER_DIRECT_PATCH, or
    ``raw_quarter`` unchanged when there is none.
    """
    patched_quarter = MADRID_QUARTER_DIRECT_PATCH.get(raw_quarter, raw_quarter)
    return patched_quarter
300
+
296
301
  def _get_district_quarter(self, raw_json: Dict[str, Any]) -> Tuple[Optional[str], Optional[str]]:
297
302
  district = self._get_attribute(raw_json, ["city_district", "suburb", "borough"])
298
303
  quarter = self._get_attribute(raw_json, ["quarter", "neighbourhood"])
299
-
300
- if not district and quarter:
301
- district = quarter
302
- quarter = None
303
-
304
+ if (city := raw_json.get("city")) and city == "Madrid":
305
+ mid_district = self._patch_district(district)
306
+ quarter = self._patch_quarter(quarter)
307
+ district = self._patch_district(mid_district, quarter)
304
308
  return district, quarter
305
309
 
306
310
  def geocode(self, address: str) -> List[Dict[str, Any]]:
@@ -356,6 +360,13 @@ class Nominatim:
356
360
  selected_state,
357
361
  )
358
362
 
363
+ # Standardize
364
+ final_result["province"] = standardize_admin_division(
365
+ name=final_result["province"], level="province", country_code=final_result["country_code"]
366
+ )
367
+ final_result["state"] = standardize_admin_division(
368
+ name=final_result["state"], level="state", country_code=final_result["country_code"]
369
+ )
359
370
  return final_result
360
371
 
361
372