datamarket 0.7.91__py3-none-any.whl → 0.7.92__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datamarket might be problematic. Click here for more details.

@@ -195,6 +195,29 @@ class AlchemyInterface:
195
195
 
196
196
  query_results.update({column_name: default_value}, synchronize_session=False)
197
197
 
198
@staticmethod
def _log_integrity_error(ex: IntegrityError, alchemy_obj, action="insert"):
    """
    Log an IntegrityError as one compact, readable message keyed on its SQLSTATE code.

    Consult https://www.postgresql.org/docs/current/errcodes-appendix.html (class 23)
    for the meaning of each code.

    Args:
        ex: The caught IntegrityError; its ``orig`` attribute is read for the SQLSTATE
            code and for the raw database message.
        alchemy_obj: The object that was being written; interpolated into the message.
        action: Verb naming the attempted operation (e.g. "insert", "upsert").
    """

    # SQLSTATE class 23 codes mapped to human-readable labels.
    PG_ERROR_LABELS = {
        "23000": "Integrity constraint violation",
        "23001": "Restrict violation",
        "23502": "NOT NULL violation",
        "23503": "Foreign key violation",
        "23505": "Unique violation",
        "23514": "Check constraint violation",
        "23P01": "Exclusion constraint violation",
    }

    # psycopg2 exposes the SQLSTATE as `pgcode`; psycopg 3 exposes it as `sqlstate`.
    # Try both so the label is not lost when the driver changes.
    orig = ex.orig
    code = getattr(orig, "pgcode", None) or getattr(orig, "sqlstate", None)
    label = PG_ERROR_LABELS.get(code, "Integrity error (unspecified)")

    # One message: short label first, raw DB message on its own line.
    # No traceback is attached to the log record.
    logger.error(f"{label} trying to {action} {alchemy_obj}\nPostgreSQL message: {ex.orig}")
219
+
220
+
198
221
  def insert_alchemy_obj(self, alchemy_obj: ModelType, silent: bool = False) -> bool:
199
222
  if self.session is None:
200
223
  raise RuntimeError("Session not active. Use 'with AlchemyInterface(...):' or call start()")
@@ -205,10 +228,10 @@ class AlchemyInterface:
205
228
  if not silent:
206
229
  logger.info(f"adding {alchemy_obj}...")
207
230
  self.session.add(alchemy_obj)
208
- except IntegrityError:
231
+ except IntegrityError as ex:
209
232
  # Rollback is handled automatically by begin_nested() context manager on error
210
233
  if not silent:
211
- logger.info(f"{alchemy_obj} already in db (savepoint rolled back)")
234
+ self._log_integrity_error(ex, alchemy_obj, action="insert")
212
235
  # Do not re-raise, allow outer transaction/loop to continue
213
236
  return False
214
237
 
@@ -264,10 +287,10 @@ class AlchemyInterface:
264
287
  # Use a savepoint (nested transaction)
265
288
  with self.session.begin_nested():
266
289
  self.session.execute(statement)
267
- except IntegrityError:
290
+ except IntegrityError as ex:
268
291
  # Rollback is handled automatically by begin_nested() context manager on error
269
292
  if not silent:
270
- logger.info(f"could not upsert {alchemy_obj} (savepoint rolled back)")
293
+ self._log_integrity_error(ex, alchemy_obj, action="upsert")
271
294
  # Do not re-raise, allow outer transaction/loop to continue
272
295
  return False
273
296
 
@@ -10,8 +10,9 @@ import requests
10
10
  from geopy.distance import geodesic
11
11
  from jellyfish import jaro_winkler_similarity
12
12
 
13
- from ..params.nominatim import CITY_TO_PROVINCE, POSTCODES, STATES
13
+ from ..params.nominatim import CITY_TO_PROVINCE, POSTCODES
14
14
  from ..utils.strings import normalize
15
+ from ..utils.nominatim import standardize_admin_division
15
16
 
16
17
  ########################################################################################################################
17
18
  # PARAMETERS
@@ -141,24 +142,6 @@ class Nominatim:
141
142
  "number": None,
142
143
  }
143
144
 
144
- @staticmethod
145
- def _canonicalize_state(state: Optional[str]) -> Optional[str]:
146
- """
147
- Canonicalize the state name using similarity. The most similar canonical state name is
148
- returned if the similarity score is above the threshold.
149
- """
150
- if not state:
151
- return None
152
- norm_state = normalize(state)
153
- best_match = None
154
- best_score = 0.0
155
- for canonical in STATES:
156
- score = jaro_winkler_similarity(norm_state, normalize(canonical))
157
- if score > best_score:
158
- best_score = score
159
- best_match = canonical
160
- return best_match if best_score > JARO_WINKLER_THRESHOLD else None
161
-
162
145
  def _select_postcode_and_derived_province(
163
146
  self,
164
147
  parsed_nominatim_result: Dict[str, Optional[str]],
@@ -243,9 +226,6 @@ class Nominatim:
243
226
  if not state and nominatim_pc_valid:
244
227
  state = parsed_nominatim_result.get("state")
245
228
 
246
- # Canonicalize
247
- state = self._canonicalize_state(state)
248
-
249
229
  return postcode, province, state
250
230
 
251
231
  def _select_final_result(
@@ -355,7 +335,16 @@ class Nominatim:
355
335
  selected_province_from_postcode,
356
336
  selected_state,
357
337
  )
358
-
338
+
339
+ # Standardize
340
+ final_result["province"] = standardize_admin_division(
341
+ name=final_result["province"],
342
+ level="province",
343
+ country_code=final_result["country_code"])
344
+ final_result["state"] = standardize_admin_division(
345
+ name=final_result["state"],
346
+ level="state",
347
+ country_code=final_result["country_code"])
359
348
  return final_result
360
349
 
361
350
 
@@ -369,4 +358,4 @@ class NominatimInterface(Nominatim):
369
358
 
370
359
  super().__init__(self.nominatim_endpoint, self.geonames_endpoint)
371
360
  else:
372
- logger.warning("no osm section in config")
361
+ logger.warning("no osm section in config")
@@ -55,24 +55,90 @@ POSTCODES = {
55
55
  "52": "Melilla",
56
56
  }
57
57
 
58
+ # Mapping of normalized names (for comparison) to standardized names (for storing)
59
+ # for each corresponding country code
58
60
  STATES = {
59
- "Andalucía",
60
- "Aragón",
61
- "Asturias",
62
- "Baleares",
63
- "Canarias",
64
- "Cantabria",
65
- "Castilla-La Mancha",
66
- "Castilla y León",
67
- "Cataluña",
68
- "Ceuta",
69
- "Comunidad Valenciana",
70
- "Extremadura",
71
- "Galicia",
72
- "La Rioja",
73
- "Madrid",
74
- "Melilla",
75
- "Murcia",
76
- "Navarra",
77
- "País Vasco",
61
+ "es": {
62
+ "andalucia": "Andalucía",
63
+ "aragon": "Aragón",
64
+ "asturias": "Asturias",
65
+ "baleares": "Baleares",
66
+ "canarias": "Canarias",
67
+ "cantabria": "Cantabria",
68
+ "castilla la mancha": "Castilla-La Mancha",
69
+ "castilla y leon": "Castilla y León",
70
+ "cataluna": "Cataluña",
71
+ "ceuta": "Ceuta",
72
+ "comunidad valenciana": "Comunidad Valenciana",
73
+ "extremadura": "Extremadura",
74
+ "galicia": "Galicia",
75
+ "la rioja": "La Rioja",
76
+ "madrid": "Comunidad de Madrid",
77
+ "melilla": "Melilla",
78
+ "murcia": "Murcia",
79
+ "navarra": "Navarra",
80
+ "pais vasco": "País Vasco",
81
+ "euskadi": "País Vasco", # Alias not caught by rapidfuzz
82
+ }
78
83
  }
84
+
85
+ PROVINCES = {
86
+ "es": {
87
+ "alava": "Álava",
88
+ "araba": "Álava", # Alias not caught by rapidfuzz
89
+ "albacete": "Albacete",
90
+ "alicante": "Alicante",
91
+ "almeria": "Almería",
92
+ "asturias": "Asturias",
93
+ "avila": "Ávila",
94
+ "badajoz": "Badajoz",
95
+ "barcelona": "Barcelona",
96
+ "bizkaia": "Vizcaya",
97
+ "burgos": "Burgos",
98
+ "caceres": "Cáceres",
99
+ "cadiz": "Cádiz",
100
+ "cantabria": "Cantabria",
101
+ "castellon": "Castellón",
102
+ "ceuta": "Ceuta", # Considered province by opensm and/or geonames
103
+ "ciudad real": "Ciudad Real",
104
+ "cordoba": "Córdoba",
105
+ "cuenca": "Cuenca",
106
+ "gipuzkoa": "Gipuzkoa",
107
+ "gerona": "Gerona",
108
+ "granada": "Granada",
109
+ "guadalajara": "Guadalajara",
110
+ "huelva": "Huelva",
111
+ "huesca": "Huesca",
112
+ "islas baleares": "Islas Baleares",
113
+ "jaen": "Jaén",
114
+ "la coruna": "La Coruña",
115
+ "la rioja": "La Rioja",
116
+ "las palmas": "Las Palmas",
117
+ "leon": "León",
118
+ "lerida": "Lérida",
119
+ "lugo": "Lugo",
120
+ "madrid": "Madrid",
121
+ "malaga": "Málaga",
122
+ "melilla": "Melilla", # Considered province by opensm and/or geonames
123
+ "murcia": "Murcia",
124
+ "navarra": "Navarra",
125
+ "orense": "Orense",
126
+ "palencia": "Palencia",
127
+ "pontevedra": "Pontevedra",
128
+ "salamanca": "Salamanca",
129
+ "santa cruz de tenerife": "Santa Cruz de Tenerife",
130
+ "segovia": "Segovia",
131
+ "sevilla": "Sevilla",
132
+ "soria": "Soria",
133
+ "tarragona": "Tarragona",
134
+ "teruel": "Teruel",
135
+ "toledo": "Toledo",
136
+ "valencia": "Valencia",
137
+ "valladolid": "Valladolid",
138
+ "zamora": "Zamora",
139
+ "zaragoza": "Zaragoza",
140
+ }
141
+ }
142
+
143
+ # Cutoff score for rapidfuzz in the name standardization function
144
+ STANDARD_THRESHOLD = 40
datamarket/utils/main.py CHANGED
@@ -9,6 +9,7 @@ import re
9
9
  import shlex
10
10
  import subprocess
11
11
  import time
12
+ from babel.numbers import parse_decimal
12
13
 
13
14
  from bs4 import BeautifulSoup
14
15
  import pendulum
@@ -89,6 +90,19 @@ def text_to_int(text):
89
90
  return num
90
91
 
91
92
 
93
+ def text_to_float(text: str | None, locale: str = "es_ES") -> float | None:
94
+ if not text:
95
+ return None
96
+ match = re.search(r"\d(?:[\d\s.,]*\d)?", text)
97
+ if not match:
98
+ return None
99
+ number_str = match.group(0).replace(" ", "")
100
+ try:
101
+ return float(parse_decimal(number_str, locale=locale))
102
+ except Exception:
103
+ return None
104
+
105
+
92
106
  def sleep_out_interval(from_h, to_h, tz="Europe/Madrid", seconds=1800):
93
107
  while pendulum.now(tz=tz).hour >= to_h or pendulum.now(tz=tz).hour < from_h:
94
108
  logger.warning("time to sleep and not scrape anything...")
@@ -0,0 +1,38 @@
1
+ from typing import Optional, Literal
2
+ from rapidfuzz import fuzz, process
3
+ from ..params.nominatim import STATES, PROVINCES, STANDARD_THRESHOLD
4
+ from .strings import normalize
5
+
6
def standardize_admin_division(
    name: Optional[str],
    level: Literal["province", "state"] = "province",
    country_code: Optional[str] = "es",
) -> Optional[str]:
    """
    Normalize and standardize administrative divisions of a given country using RapidFuzz.
    Uses normalized dict keys for comparison and returns dict values with the official names.

    Args:
        name: Raw division name; ``None`` or empty yields ``None``.
        level: Which mapping to use: ``"state"`` selects STATES, anything else PROVINCES.
        country_code: Country code used to pick the per-country mapping (case-insensitive).

    Returns:
        The standardized name; the raw ``name`` when there is no country code or no
        mapping for that country; ``None`` when ``name`` is empty or no candidate
        reaches STANDARD_THRESHOLD.
    """
    if not name:
        return None

    # A missing country code cannot select a mapping. Treat it like a country that
    # is not standardized instead of failing on `.lower()`.
    if not country_code:
        return name

    country_code = country_code.lower()
    mapping = STATES.get(country_code) if level == "state" else PROVINCES.get(country_code)

    if not mapping:  # If country is not standardized, return raw name
        return name

    normalized_name = normalize(name)  # Essential for rapidfuzz to work well

    # An exact key hit needs no fuzzy scoring.
    if normalized_name in mapping:
        return mapping[normalized_name]

    result = process.extractOne(
        normalized_name,
        mapping.keys(),  # Compare with the normalized names in the dict
        scorer=fuzz.WRatio,
        score_cutoff=STANDARD_THRESHOLD,
    )

    if not result:
        return None

    # Only the matched key is needed (first element of the result).
    best_key = result[0]

    # Return the standardized name corresponding to the normalized name
    return mapping[best_key]
@@ -215,4 +215,4 @@ def normalize(
215
215
  if naming is NamingConvention.PASCAL:
216
216
  return camelize(underscored)
217
217
 
218
- return underscored
218
+ return underscored
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: datamarket
3
- Version: 0.7.91
3
+ Version: 0.7.92
4
4
  Summary: Utilities that integrate advanced scraping knowledge into just one library.
5
5
  License: GPL-3.0-or-later
6
6
  Author: DataMarket
@@ -64,6 +64,7 @@ Provides-Extra: undetected-chromedriver
64
64
  Provides-Extra: xmltodict
65
65
  Requires-Dist: SQLAlchemy (>=2.0.0,<3.0.0)
66
66
  Requires-Dist: azure-storage-blob (>=12.0.0,<13.0.0) ; extra == "azure-storage-blob"
67
+ Requires-Dist: babel (>=2.0.0,<3.0.0)
67
68
  Requires-Dist: beautifulsoup4 (>=4.0.0,<5.0.0)
68
69
  Requires-Dist: boto3 (>=1.35.0,<1.36.0) ; extra == "boto3" or extra == "aws" or extra == "peerdb"
69
70
  Requires-Dist: browserforge (>=1.2.0,<2.0.0) ; extra == "camoufox"
@@ -2,32 +2,33 @@ datamarket/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  datamarket/exceptions/__init__.py,sha256=-Vu-RZNKjW6fYCLqbUJTkKNuHeA8Yi_gyR50oZNaA_8,33
3
3
  datamarket/exceptions/main.py,sha256=MP5ql6M7DoMbBf-Dg_2ohcUFdWXgzv-dXHntPPit31s,453
4
4
  datamarket/interfaces/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
- datamarket/interfaces/alchemy.py,sha256=mQwjDqBpz1QHRV2JTCALvn5iK_ky69oE2Gw-EtRXsqQ,14664
5
+ datamarket/interfaces/alchemy.py,sha256=TquEsTeYmiFaiyaTgIx0yF3McQeRdY5YVMsTOXrsT9s,15592
6
6
  datamarket/interfaces/aws.py,sha256=zfFEtIz6PsLMjKPOUcYuVgBcBIUkuNI2xKW-QturS30,4314
7
7
  datamarket/interfaces/azure.py,sha256=PnPlo95skYiq63qYa4QDvEnVYi2JblPmMSfbTsmXhFs,4937
8
8
  datamarket/interfaces/drive.py,sha256=3nhx3THr2SHNWKYwme9F2nPpvsqyEMFIxz0whF2FjHk,4840
9
9
  datamarket/interfaces/ftp.py,sha256=K219-PP21EhQo1A1LkvRLahlrw2-pf4svBN0LogZaJE,2813
10
- datamarket/interfaces/nominatim.py,sha256=HLk0FcdfbOVCF_i71l-Hlb17swL0W1a3Gg2n5OLD0tM,15507
10
+ datamarket/interfaces/nominatim.py,sha256=TjS9O2U446XuPUzfP65NwDSG-RDNqmYb6-NKikM-34w,15187
11
11
  datamarket/interfaces/peerdb.py,sha256=sO451wEGNb_0DDwchZ6eBVYKltqHM5XKau-WsfspXzA,23640
12
12
  datamarket/interfaces/proxy.py,sha256=Uu-dHvpQOLNBZPGHAanLXnKT1789ArcHfOw8exECt34,5398
13
13
  datamarket/interfaces/tinybird.py,sha256=AYrcRGNOCoCt7ojilkWa27POROee9sTCwZ61GGHEPeM,2698
14
14
  datamarket/params/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
- datamarket/params/nominatim.py,sha256=fELSriTLSpotoT_k6Ft98fnD8EKHA8WopxPYPLSRfgs,1531
15
+ datamarket/params/nominatim.py,sha256=g_zx4WYumj8F2Z9xSCvLXLGv-PuAckCGnSxU821u2O8,3986
16
16
  datamarket/utils/__init__.py,sha256=FHLh-Qp9XpM4LkAocppCf_llW2CWVVghGorkqxqt1wk,34
17
17
  datamarket/utils/airflow.py,sha256=al0vc0YUikNu3Oy51VSn52I7pMU40akFBOl_UlHa2E4,795
18
18
  datamarket/utils/alchemy.py,sha256=SRq6kgh1aANXVShBPgAuglmNhZssPWwWEY503gKSia8,635
19
- datamarket/utils/main.py,sha256=WweHHt3Ti-tVXdmLnpNYGsYpyTaCx_o1mvnL7_NomVY,5450
19
+ datamarket/utils/main.py,sha256=KYHjDOps6_Q3TFV_Jj7MLj-L9Evx05AXELCvp06BARU,5857
20
+ datamarket/utils/nominatim.py,sha256=R0yuEQ6aOG_eM1D-PeMlTV0GtvhWoAkzpR-xYx8U_g0,1247
20
21
  datamarket/utils/playwright/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
21
22
  datamarket/utils/playwright/async_api.py,sha256=UbA2D4ScBtYeMfrRjly4RO-s8wXIub9c05J1eoOCpsQ,5782
22
23
  datamarket/utils/playwright/sync_api.py,sha256=Tw_-KLB3vipFuEQwcX8iCbj7giCzcwXB-bhl_ncR-2Q,5542
23
24
  datamarket/utils/selenium.py,sha256=IMKlbLzXABFhACnWzhHmB0l2hhVzNwHGZwbo14nEewQ,2499
24
25
  datamarket/utils/soda.py,sha256=eZTXFbI1P3WoMd1MM-YjoVTpdjTcDSWuvBb7ViBMhSQ,941
25
26
  datamarket/utils/strings/__init__.py,sha256=b6TYOT9v7y9ID-lDyZk4E8BH2uIPbsF2ZSLGjCQ1MCQ,43
26
- datamarket/utils/strings/normalization.py,sha256=rj0wfJSjqcCRp-ruHqc5pylO3_TOmY5_V1lKzkyWoAA,8991
27
+ datamarket/utils/strings/normalization.py,sha256=4HSs-8y1k36_xett0nqN2ND_A3xFn4591n5I-lhpaLM,8990
27
28
  datamarket/utils/strings/obfuscation.py,sha256=Jo-x3f2Cb75983smmpcdPqUlBrLCTyrnmH3FPlgUUjM,5246
28
29
  datamarket/utils/typer.py,sha256=FDF3l6gh3UlAFPsHCtesnekvct2rKz0oFn3uKARBQvE,814
29
30
  datamarket/utils/types.py,sha256=vxdQZdwdXrfPR4Es52gBgol-tMRIOD6oK9cBo3rB0JQ,74
30
- datamarket-0.7.91.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
31
- datamarket-0.7.91.dist-info/METADATA,sha256=t_iQOXjo_CHGC7XwOBnPtWTTWypi-MaXrQBawy103vM,7348
32
- datamarket-0.7.91.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
33
- datamarket-0.7.91.dist-info/RECORD,,
31
+ datamarket-0.7.92.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
32
+ datamarket-0.7.92.dist-info/METADATA,sha256=OtPM2d_9vLMdOfAACyAnS2UpLDgJKRcDk-fp5ERKTG4,7386
33
+ datamarket-0.7.92.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
34
+ datamarket-0.7.92.dist-info/RECORD,,