datamarket 0.7.97__tar.gz → 0.7.98__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datamarket might be problematic. Click here for more details.

Files changed (37) hide show
  1. {datamarket-0.7.97 → datamarket-0.7.98}/PKG-INFO +1 -1
  2. {datamarket-0.7.97 → datamarket-0.7.98}/pyproject.toml +1 -1
  3. {datamarket-0.7.97 → datamarket-0.7.98}/src/datamarket/interfaces/alchemy.py +4 -1
  4. datamarket-0.7.98/src/datamarket/params/nominatim.py +427 -0
  5. datamarket-0.7.98/src/datamarket/utils/nominatim.py +204 -0
  6. {datamarket-0.7.97 → datamarket-0.7.98}/src/datamarket/utils/strings/normalization.py +10 -5
  7. datamarket-0.7.98/src/datamarket/utils/strings/standardization.py +69 -0
  8. datamarket-0.7.97/src/datamarket/params/nominatim.py +0 -144
  9. datamarket-0.7.97/src/datamarket/utils/nominatim.py +0 -38
  10. {datamarket-0.7.97 → datamarket-0.7.98}/LICENSE +0 -0
  11. {datamarket-0.7.97 → datamarket-0.7.98}/README.md +0 -0
  12. {datamarket-0.7.97 → datamarket-0.7.98}/src/datamarket/__init__.py +0 -0
  13. {datamarket-0.7.97 → datamarket-0.7.98}/src/datamarket/exceptions/__init__.py +0 -0
  14. {datamarket-0.7.97 → datamarket-0.7.98}/src/datamarket/exceptions/main.py +0 -0
  15. {datamarket-0.7.97 → datamarket-0.7.98}/src/datamarket/interfaces/__init__.py +0 -0
  16. {datamarket-0.7.97 → datamarket-0.7.98}/src/datamarket/interfaces/aws.py +0 -0
  17. {datamarket-0.7.97 → datamarket-0.7.98}/src/datamarket/interfaces/azure.py +0 -0
  18. {datamarket-0.7.97 → datamarket-0.7.98}/src/datamarket/interfaces/drive.py +0 -0
  19. {datamarket-0.7.97 → datamarket-0.7.98}/src/datamarket/interfaces/ftp.py +0 -0
  20. {datamarket-0.7.97 → datamarket-0.7.98}/src/datamarket/interfaces/nominatim.py +0 -0
  21. {datamarket-0.7.97 → datamarket-0.7.98}/src/datamarket/interfaces/peerdb.py +0 -0
  22. {datamarket-0.7.97 → datamarket-0.7.98}/src/datamarket/interfaces/proxy.py +0 -0
  23. {datamarket-0.7.97 → datamarket-0.7.98}/src/datamarket/interfaces/tinybird.py +0 -0
  24. {datamarket-0.7.97 → datamarket-0.7.98}/src/datamarket/params/__init__.py +0 -0
  25. {datamarket-0.7.97 → datamarket-0.7.98}/src/datamarket/utils/__init__.py +0 -0
  26. {datamarket-0.7.97 → datamarket-0.7.98}/src/datamarket/utils/airflow.py +0 -0
  27. {datamarket-0.7.97 → datamarket-0.7.98}/src/datamarket/utils/alchemy.py +0 -0
  28. {datamarket-0.7.97 → datamarket-0.7.98}/src/datamarket/utils/main.py +0 -0
  29. {datamarket-0.7.97 → datamarket-0.7.98}/src/datamarket/utils/playwright/__init__.py +0 -0
  30. {datamarket-0.7.97 → datamarket-0.7.98}/src/datamarket/utils/playwright/async_api.py +0 -0
  31. {datamarket-0.7.97 → datamarket-0.7.98}/src/datamarket/utils/playwright/sync_api.py +0 -0
  32. {datamarket-0.7.97 → datamarket-0.7.98}/src/datamarket/utils/selenium.py +0 -0
  33. {datamarket-0.7.97 → datamarket-0.7.98}/src/datamarket/utils/soda.py +0 -0
  34. {datamarket-0.7.97 → datamarket-0.7.98}/src/datamarket/utils/strings/__init__.py +0 -0
  35. {datamarket-0.7.97 → datamarket-0.7.98}/src/datamarket/utils/strings/obfuscation.py +0 -0
  36. {datamarket-0.7.97 → datamarket-0.7.98}/src/datamarket/utils/typer.py +0 -0
  37. {datamarket-0.7.97 → datamarket-0.7.98}/src/datamarket/utils/types.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: datamarket
3
- Version: 0.7.97
3
+ Version: 0.7.98
4
4
  Summary: Utilities that integrate advanced scraping knowledge into just one library.
5
5
  License: GPL-3.0-or-later
6
6
  Author: DataMarket
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "datamarket"
3
- version = "0.7.97"
3
+ version = "0.7.98"
4
4
  description = "Utilities that integrate advanced scraping knowledge into just one library."
5
5
  authors = ["DataMarket <techsupport@datamarket.es>"]
6
6
  license = "GPL-3.0-or-later"
@@ -215,7 +215,10 @@ class AlchemyInterface:
215
215
  label = PG_ERROR_LABELS.get(code, "Integrity error (unspecified)")
216
216
 
217
217
  # Log one clean message with trace + the raw DB message separately
218
- logger.error(f"{label} trying to {action} {alchemy_obj}\nPostgreSQL message: {ex.orig}")
218
+ if code == "23505": # A simple info log for unique violations
219
+ logger.info(f"{label} trying to {action} {alchemy_obj}")
220
+ else:
221
+ logger.error(f"{label} trying to {action} {alchemy_obj}\nPostgreSQL message: {ex.orig}")
219
222
 
220
223
 
221
224
  def insert_alchemy_obj(self, alchemy_obj: ModelType, silent: bool = False) -> bool:
@@ -0,0 +1,427 @@
1
+ from unidecode import unidecode
2
+ import re
3
+
4
+ CITY_TO_PROVINCE = {"Madrid": "Madrid"}
5
+
6
+ POSTCODES = {
7
+ "01": "Álava",
8
+ "02": "Albacete",
9
+ "03": "Alicante",
10
+ "04": "Almería",
11
+ "05": "Ávila",
12
+ "06": "Badajoz",
13
+ "07": "Baleares",
14
+ "08": "Barcelona",
15
+ "09": "Burgos",
16
+ "10": "Cáceres",
17
+ "11": "Cádiz",
18
+ "12": "Castellón",
19
+ "13": "Ciudad Real",
20
+ "14": "Córdoba",
21
+ "15": "La Coruña",
22
+ "16": "Cuenca",
23
+ "17": "Gerona",
24
+ "18": "Granada",
25
+ "19": "Guadalajara",
26
+ "20": "Guipúzcoa",
27
+ "21": "Huelva",
28
+ "22": "Huesca",
29
+ "23": "Jaén",
30
+ "24": "León",
31
+ "25": "Lérida",
32
+ "26": "La Rioja",
33
+ "27": "Lugo",
34
+ "28": "Madrid",
35
+ "29": "Málaga",
36
+ "30": "Murcia",
37
+ "31": "Navarra",
38
+ "32": "Orense",
39
+ "33": "Asturias",
40
+ "34": "Palencia",
41
+ "35": "Las Palmas",
42
+ "36": "Pontevedra",
43
+ "37": "Salamanca",
44
+ "38": "Santa Cruz de Tenerife",
45
+ "39": "Cantabria",
46
+ "40": "Segovia",
47
+ "41": "Sevilla",
48
+ "42": "Soria",
49
+ "43": "Tarragona",
50
+ "44": "Teruel",
51
+ "45": "Toledo",
52
+ "46": "Valencia",
53
+ "47": "Valladolid",
54
+ "48": "Vizcaya",
55
+ "49": "Zamora",
56
+ "50": "Zaragoza",
57
+ "51": "Ceuta",
58
+ "52": "Melilla",
59
+ }
60
+
61
+ # Mapping of normalized names (for comparison) to standardized names (for storing)
62
+ # for each corresponding country code
63
+ STATES = {
64
+ "es": {
65
+ "andalucia": "Andalucía",
66
+ "aragon": "Aragón",
67
+ "asturias": "Asturias",
68
+ "baleares": "Baleares",
69
+ "canarias": "Canarias",
70
+ "cantabria": "Cantabria",
71
+ "castilla la mancha": "Castilla-La Mancha",
72
+ "castilla y leon": "Castilla y León",
73
+ "cataluna": "Cataluña",
74
+ "ceuta": "Ceuta",
75
+ "comunidad valenciana": "Comunidad Valenciana",
76
+ "extremadura": "Extremadura",
77
+ "galicia": "Galicia",
78
+ "la rioja": "La Rioja",
79
+ "madrid": "Comunidad de Madrid",
80
+ "melilla": "Melilla",
81
+ "murcia": "Murcia",
82
+ "navarra": "Navarra",
83
+ "pais vasco": "País Vasco",
84
+ "euskadi": "País Vasco", # Alias not caught by rapidfuzz
85
+ }
86
+ }
87
+
88
+ PROVINCES = {
89
+ "es": {
90
+ "alava": "Álava",
91
+ "araba": "Álava", # Alias not caught by rapidfuzz
92
+ "albacete": "Albacete",
93
+ "alicante": "Alicante",
94
+ "almeria": "Almería",
95
+ "asturias": "Asturias",
96
+ "avila": "Ávila",
97
+ "badajoz": "Badajoz",
98
+ "barcelona": "Barcelona",
99
+ "bizkaia": "Vizcaya",
100
+ "burgos": "Burgos",
101
+ "caceres": "Cáceres",
102
+ "cadiz": "Cádiz",
103
+ "cantabria": "Cantabria",
104
+ "castellon": "Castellón",
105
+ "ceuta": "Ceuta", # Considered province by opensm and/or geonames
106
+ "ciudad real": "Ciudad Real",
107
+ "cordoba": "Córdoba",
108
+ "cuenca": "Cuenca",
109
+ "gipuzkoa": "Gipuzkoa",
110
+ "gerona": "Gerona",
111
+ "granada": "Granada",
112
+ "guadalajara": "Guadalajara",
113
+ "huelva": "Huelva",
114
+ "huesca": "Huesca",
115
+ "islas baleares": "Islas Baleares",
116
+ "jaen": "Jaén",
117
+ "la coruna": "La Coruña",
118
+ "la rioja": "La Rioja",
119
+ "las palmas": "Las Palmas",
120
+ "leon": "León",
121
+ "lerida": "Lérida",
122
+ "lugo": "Lugo",
123
+ "madrid": "Madrid",
124
+ "malaga": "Málaga",
125
+ "melilla": "Melilla", # Considered province by opensm and/or geonames
126
+ "murcia": "Murcia",
127
+ "navarra": "Navarra",
128
+ "orense": "Orense",
129
+ "palencia": "Palencia",
130
+ "pontevedra": "Pontevedra",
131
+ "salamanca": "Salamanca",
132
+ "santa cruz de tenerife": "Santa Cruz de Tenerife",
133
+ "segovia": "Segovia",
134
+ "sevilla": "Sevilla",
135
+ "soria": "Soria",
136
+ "tarragona": "Tarragona",
137
+ "teruel": "Teruel",
138
+ "toledo": "Toledo",
139
+ "valencia": "Valencia",
140
+ "valladolid": "Valladolid",
141
+ "zamora": "Zamora",
142
+ "zaragoza": "Zaragoza",
143
+ }
144
+ }
145
+
146
+
147
+ PROVINCE_TO_POSTCODE = {
148
+ "es": {
149
+ "A Coruña": "15",
150
+ "Álava": "01",
151
+ "Araba": "01",
152
+ "Alacant": "03",
153
+ "Alicante": "03",
154
+ "Albacete": "02",
155
+ "Almería": "04",
156
+ "Asturias": "33",
157
+ "Ávila": "05",
158
+ "Badajoz": "06",
159
+ "Baleares": "07",
160
+ "Barcelona": "08",
161
+ "Bizkaia": "48",
162
+ "Burgos": "09",
163
+ "Cáceres": "10",
164
+ "Cádiz": "11",
165
+ "Cantabria": "39",
166
+ "Castelló": "12",
167
+ "Castellón": "12",
168
+ "Ceuta": "51",
169
+ "Ciudad Real": "13",
170
+ "Córdoba": "14",
171
+ "Cuenca": "16",
172
+ "Gerona": "17",
173
+ "Gipuzkoa": "20",
174
+ "Girona": "17",
175
+ "Granada": "18",
176
+ "Guadalajara": "19",
177
+ "Guipúzcoa": "20",
178
+ "Huelva": "21",
179
+ "Huesca": "22",
180
+ "Illes Balears": "07",
181
+ "Jaén": "23",
182
+ "La Coruña": "15",
183
+ "La Rioja": "26",
184
+ "Las Palmas": "35",
185
+ "León": "24",
186
+ "Lérida": "25",
187
+ "Lleida": "25",
188
+ "Lugo": "27",
189
+ "Madrid": "28",
190
+ "Málaga": "29",
191
+ "Melilla": "52",
192
+ "Murcia": "30",
193
+ "Navarra": "31",
194
+ "Orense": "32",
195
+ "Ourense": "32",
196
+ "Palencia": "34",
197
+ "Pontevedra": "36",
198
+ "Salamanca": "37",
199
+ "Santa Cruz de Tenerife": "38",
200
+ "Segovia": "40",
201
+ "Sevilla": "41",
202
+ "Soria": "42",
203
+ "Tarragona": "43",
204
+ "Teruel": "44",
205
+ "Toledo": "45",
206
+ "València": "46",
207
+ "Valencia": "46",
208
+ "Valladolid": "47",
209
+ "Vizcaya": "48",
210
+ "Zamora": "49",
211
+ "Zaragoza": "50",
212
+ },
213
+ "pt": {
214
+ "Aveiro": "3",
215
+ "Beja": "7",
216
+ "Braga": "4",
217
+ "Bragança": "5",
218
+ "Castelo Branco": "6",
219
+ "Coimbra": "3",
220
+ "Évora": "7",
221
+ "Faro": "8",
222
+ "Guarda": "6",
223
+ "Leiria": "2",
224
+ "Lisboa": "1",
225
+ "Portalegre": "7",
226
+ "Porto": "4",
227
+ "Santarém": "2",
228
+ "Setúbal": "2",
229
+ "Viana do Castelo": "4",
230
+ "Vila Real": "5",
231
+ "Viseu": "3",
232
+ "Açores": "9",
233
+ "Madeira": "9",
234
+ },
235
+ }
236
+
237
+
238
+ POSTCODE_TO_STATES = {
239
+ "es": {
240
+ # Andalucía
241
+ "04": "Andalucía",
242
+ "11": "Andalucía",
243
+ "14": "Andalucía",
244
+ "18": "Andalucía",
245
+ "21": "Andalucía",
246
+ "23": "Andalucía",
247
+ "29": "Andalucía",
248
+ "41": "Andalucía",
249
+ # Aragón
250
+ "22": "Aragón",
251
+ "44": "Aragón",
252
+ "50": "Aragón",
253
+ # Asturias
254
+ "33": "Principado de Asturias",
255
+ # Baleares
256
+ "07": "Islas Baleares",
257
+ # Canarias
258
+ "35": "Canarias",
259
+ "38": "Canarias",
260
+ # Cantabria
261
+ "39": "Cantabria",
262
+ # Castilla y León
263
+ "05": "Castilla y León",
264
+ "09": "Castilla y León",
265
+ "24": "Castilla y León",
266
+ "34": "Castilla y León",
267
+ "37": "Castilla y León",
268
+ "40": "Castilla y León",
269
+ "42": "Castilla y León",
270
+ "47": "Castilla y León",
271
+ "49": "Castilla y León",
272
+ # Castilla-La Mancha
273
+ "02": "Castilla-La Mancha",
274
+ "13": "Castilla-La Mancha",
275
+ "16": "Castilla-La Mancha",
276
+ "19": "Castilla-La Mancha",
277
+ "45": "Castilla-La Mancha",
278
+ # Cataluña
279
+ "08": "Cataluña",
280
+ "17": "Cataluña",
281
+ "25": "Cataluña",
282
+ "43": "Cataluña",
283
+ # Comunidad Valenciana
284
+ "03": "Comunidad Valenciana",
285
+ "12": "Comunidad Valenciana",
286
+ "46": "Comunidad Valenciana",
287
+ # Extremadura
288
+ "06": "Extremadura",
289
+ "10": "Extremadura",
290
+ # Galicia
291
+ "15": "Galicia",
292
+ "27": "Galicia",
293
+ "32": "Galicia",
294
+ "36": "Galicia",
295
+ # Madrid
296
+ "28": "Comunidad de Madrid",
297
+ # Murcia
298
+ "30": "Región de Murcia",
299
+ # Navarra
300
+ "31": "Comunidad Foral de Navarra",
301
+ # País Vasco
302
+ "01": "País Vasco",
303
+ "20": "País Vasco",
304
+ "48": "País Vasco",
305
+ # La Rioja
306
+ "26": "La Rioja",
307
+ # Ciudades Autónomas
308
+ "51": "Ceuta",
309
+ "52": "Melilla",
310
+ },
311
+ "pt": { # --- NORTE ---
312
+ "40": "Porto",
313
+ "41": "Porto",
314
+ "42": "Porto",
315
+ "43": "Porto",
316
+ "44": "Porto",
317
+ "45": "Aveiro", # Concelhos do norte de Aveiro, na fronteira com Porto.
318
+ "47": "Braga",
319
+ "48": "Braga", # Guimarães.
320
+ "49": "Viana do Castelo",
321
+ "50": "Vila Real",
322
+ "51": "Vila Real",
323
+ "52": "Vila Real",
324
+ "53": "Vila Real / Bragança", # Zona fronteiriça.
325
+ "54": "Bragança",
326
+ # --- CENTRO ---
327
+ "60": "Castelo Branco",
328
+ "61": "Castelo Branco",
329
+ "62": "Castelo Branco",
330
+ "63": "Guarda",
331
+ "30": "Coimbra",
332
+ "31": "Coimbra",
333
+ "32": "Coimbra",
334
+ "33": "Coimbra",
335
+ "34": "Viseu",
336
+ "35": "Viseu",
337
+ "37": "Aveiro",
338
+ "38": "Aveiro",
339
+ "24": "Leiria",
340
+ # --- ÁREA METROPOLITANA DE LISBOA e arredores ---
341
+ "10": "Lisboa",
342
+ "11": "Lisboa",
343
+ "12": "Lisboa",
344
+ "13": "Lisboa",
345
+ "14": "Lisboa",
346
+ "15": "Lisboa",
347
+ "16": "Lisboa",
348
+ "17": "Lisboa",
349
+ "18": "Lisboa",
350
+ "19": "Lisboa",
351
+ "20": "Santarém",
352
+ "21": "Santarém",
353
+ "22": "Santarém",
354
+ "23": "Santarém", # Tomar e Torres Novas.
355
+ "25": "Lisboa", # Concelhos como Torres Vedras, Mafra, Alenquer.
356
+ "26": "Lisboa", # Concelhos como Loures, Amadora, Odivelas.
357
+ "27": "Lisboa", # Concelhos como Sintra, Cascais, Oeiras.
358
+ "28": "Setúbal",
359
+ "29": "Setúbal",
360
+ # --- ALENTEJO ---
361
+ "70": "Évora",
362
+ "71": "Évora",
363
+ "72": "Évora",
364
+ "73": "Portalegre",
365
+ "74": "Portalegre",
366
+ "75": "Setúbal", # Litoral Alentejano (Sines, Grândola), administrativamente de Setúbal.
367
+ "76": "Beja",
368
+ "77": "Beja",
369
+ "78": "Beja",
370
+ "79": "Beja",
371
+ # --- ALGARVE ---
372
+ "80": "Faro",
373
+ "81": "Faro",
374
+ "82": "Faro",
375
+ "83": "Faro",
376
+ "84": "Faro",
377
+ "85": "Faro",
378
+ "86": "Faro",
379
+ "87": "Faro",
380
+ "88": "Faro",
381
+ "89": "Faro",
382
+ # --- REGIÕES AUTÓNOMAS ---
383
+ "90": "Madeira",
384
+ "91": "Madeira",
385
+ "92": "Madeira",
386
+ "93": "Madeira",
387
+ "95": "Açores", # Ilha de São Miguel (Ponta Delgada).
388
+ "96": "Açores", # Ilha de São Miguel (Ribeira Grande) e Santa Maria.
389
+ "97": "Açores", # Ilha Terceira (Angra do Heroísmo).
390
+ "98": "Açores", # Ilhas de São Jorge, Graciosa, Faial, Pico.
391
+ "99": "Açores", # Ilhas de Flores e Corvo.
392
+ },
393
+ }
394
+
395
# Per-country lookup tables derived once, at import time, from PROVINCE_TO_POSTCODE:
#   "choices": the province names unidecoded and lowercased (the list to search in)
#   "keys":    the original province names, index-aligned with "choices"
_NORMALIZED_PROVINCE_CACHE = {}
for country, provinces in PROVINCE_TO_POSTCODE.items():
    # Original spellings, e.g. "A Coruña", "Álava"
    original_keys = list(provinces)

    # Comparison spellings, e.g. "a coruna", "alava"
    normalized_choices = [unidecode(province).lower() for province in original_keys]

    _NORMALIZED_PROVINCE_CACHE[country] = dict(
        choices=normalized_choices,
        keys=original_keys,
    )
407
+
408
+ # Source: https://github.com/ariankoochak/regex-patterns-of-all-countries
409
# Per-country compiled patterns, keyed by lowercase country code.
#
#   zip_validate_pattern   - matches a string that is exactly one postal code
#   zip_search_pattern     - finds a postal code embedded in free text (an address)
#   phone_validate_pattern - matches a phone number with an optional calling-code
#                            prefix; group(1) is the 9-digit national number,
#                            which is what the phone standardization helpers
#                            read from the match
COUNTRY_PARSING_RULES = {
    "es": {
        "zip_validate_pattern": re.compile(r"^\d{5}$"),
        "zip_search_pattern": re.compile(r"\b\d{5}\b"),
        # "[67]", not "[6|7]": inside a character class "|" is a literal pipe,
        # not alternation. The calling code is a non-capturing group so that
        # the only capture is the national number.
        "phone_validate_pattern": re.compile(r"^(?:\+?34)?([67]\d{8})$"),
    },
    "pt": {
        "zip_validate_pattern": re.compile(r"^\d{4}[- ]{0,1}\d{3}$|^\d{4}$"),
        "zip_search_pattern": re.compile(r"\b\d{4}[- ]?\d{3}\b|\b\d{4}\b"),
        "phone_validate_pattern": re.compile(r"^(?:\+?351)?(9[1236]\d{7})$"),
    },
}
425
+
426
+ # Cutoff score for rapidfuzz in the name standardization function
427
+ STANDARD_THRESHOLD = 40
@@ -0,0 +1,204 @@
1
+ ########################################################################################################################
2
+ # IMPORTS
3
+
4
+ from typing import Literal, Optional
5
+ from rapidfuzz import fuzz, process
6
+ from unidecode import unidecode
7
+ from ..params.nominatim import (
8
+ POSTCODE_TO_STATES,
9
+ PROVINCE_TO_POSTCODE,
10
+ PROVINCES,
11
+ STANDARD_THRESHOLD,
12
+ STATES,
13
+ _NORMALIZED_PROVINCE_CACHE,
14
+ COUNTRY_PARSING_RULES
15
+ )
16
+ from .strings import normalize
17
+
18
+ ########################################################################################################################
19
+ # FUNCTIONS
20
+
21
+
22
def standardize_admin_division(
    name: str,
    level: Literal["province", "state"] = "province",
    country_code: str = "es",
) -> Optional[str]:
    """
    Map a free-text administrative division name to its standardized spelling.

    The name is normalized and fuzzy-matched (RapidFuzz WRatio) against the
    normalized keys of the STATES or PROVINCES table for the country; the
    table value of the best-scoring key is returned.

    Args:
        name (str): raw division name
        level (Literal["province", "state"]): "state" selects STATES, any
            other value selects PROVINCES
        country_code (str): country key, compared case-insensitively

    Returns:
        Optional[str]: None for an empty name or when no key reaches
            STANDARD_THRESHOLD; the raw name when the country has no table;
            otherwise the standardized name
    """
    if not name:
        return None

    tables = STATES if level == "state" else PROVINCES
    mapping = tables.get(country_code.lower())
    if not mapping:
        # Country is not standardized: hand the input back untouched
        return name

    best = process.extractOne(
        normalize(name),  # compare like with like: the keys are normalized too
        mapping.keys(),
        scorer=fuzz.WRatio,
        score_cutoff=STANDARD_THRESHOLD,
    )
    if not best:
        return None

    # extractOne yields (choice, score, index); only the matched key is needed
    return mapping[best[0]]
57
+
58
+
59
+ def parse_state(
60
+ zip_code: str,
61
+ country_code: str,
62
+ ) -> str | None:
63
+ """Given a zip code and a country code, returns the state in which the zip code is located
64
+
65
+ Args:
66
+ zip_code (str)
67
+ country_code (str)
68
+
69
+ Returns:
70
+ str | None: state if coincidence found, else None
71
+ """
72
+ country_postcodes = POSTCODE_TO_STATES.get(country_code, {})
73
+ state = country_postcodes.get(zip_code[:2], None)
74
+ return state
75
+
76
+
77
def _province_postcode_match(
    address: str,
    zip_code: str,
    country_code: str,
) -> str | None:
    """
    Repair a zip code using a province name found in the address.

    The address is unidecoded and lowercased, then searched against the cached
    normalized province names with fuzz.partial_ratio at a cutoff of 100. When
    a province is found and `zip_code` is exactly four characters long, its
    first character is replaced by that province's postcode prefix; a zip code
    of any other length is returned unchanged.

    Args:
        address (str): free-text address to look for a province name in
        zip_code (str): zip code to repair
        country_code (str): key into the province cache

    Returns:
        str | None: the repaired (or untouched) zip code, or None when the
            country is not configured or no province scores 100
    """
    province_cache = _NORMALIZED_PROVINCE_CACHE.get(country_code)
    if not province_cache:
        return None  # Country not configured

    # NOTE(review): partial_ratio aligns the shorter string inside the longer
    # one, so an address shorter than a province name can also score 100 when
    # it is contained in that name - confirm this is intended.
    hit = process.extractOne(
        unidecode(address).lower(),
        province_cache["choices"],
        scorer=fuzz.partial_ratio,
        score_cutoff=100,
    )
    if not hit:
        return None  # No province scored 100

    # hit is (choice, score, index); "keys" is index-aligned with "choices"
    province_name = province_cache["keys"][hit[2]]
    prefix = PROVINCE_TO_POSTCODE.get(country_code, {})[province_name]

    if len(zip_code) != 4:
        return zip_code
    return prefix + zip_code[1:]
127
+
128
def _parse_es_zip_code(
    zip_code: str,
    address: str,
    opt_address: str | None,
) -> str:
    """Return a Spanish zip code, recovering it from the addresses when needed.

    Order of preference: the given zip code if it validates; the first zip
    code found in `address`; the first one found in `opt_address`; the result
    of the province-based repair; and finally the given zip code unchanged.
    """
    rules = COUNTRY_PARSING_RULES['es']
    if rules['zip_validate_pattern'].match(zip_code):
        return zip_code

    finder = rules['zip_search_pattern']
    found = finder.search(address)
    if found is None and opt_address:
        found = finder.search(opt_address)
    if found is not None:
        return found.group()

    # Nothing usable in the text: try to rebuild the code from a province name
    return _province_postcode_match(address, zip_code, country_code="es") or zip_code
154
+
155
+
156
def _parse_pt_zip_code(
    zip_code: str,
    address: str,
    opt_address: str | None,
) -> str:
    """Return a Portuguese zip code, recovering it from the addresses when needed.

    Order of preference: the given zip code if it validates; the first zip
    code found in `address`; the first one found in `opt_address`; and
    finally the given zip code unchanged.
    """
    rules = COUNTRY_PARSING_RULES['pt']
    if rules['zip_validate_pattern'].match(zip_code):
        return zip_code

    finder = rules['zip_search_pattern']

    found = finder.search(address)
    if found is not None:
        return found.group()

    if opt_address:
        found = finder.search(opt_address)
        if found is not None:
            return found.group()

    return zip_code
177
+
178
+
179
+ def parse_zip_code(
180
+ address: str,
181
+ zip_code: str,
182
+ country_code: str,
183
+ opt_address: str | None = None,
184
+ ) -> str | None:
185
+ """Parse and standardize zip code
186
+
187
+ Args:
188
+ address (str): written address
189
+ zip_code (str)
190
+ country_code (str):
191
+ opt_address (str | None, optional): optional extra address, usually None. Defaults to None.
192
+
193
+ Raises:
194
+ ValueError: when parsing zip code is not supported for the passed country_code
195
+
196
+ Returns:
197
+ str | None
198
+ """
199
+ if country_code == "es":
200
+ return _parse_es_zip_code(zip_code, address, opt_address)
201
+ elif country_code == "pt":
202
+ return _parse_pt_zip_code(zip_code, address, opt_address)
203
+ else:
204
+ raise ValueError(f"Country code ({country_code}) is not currently supported")
@@ -1,10 +1,9 @@
1
1
  ########################################################################################################################
2
2
  # IMPORTS
3
-
3
+ import re
4
4
  import unicodedata
5
5
  from enum import Enum, auto
6
6
  from typing import Any, Optional, Set, Union
7
-
8
7
  import numpy as np
9
8
  from inflection import camelize, parameterize, titleize, underscore
10
9
  from string_utils import prettify, strip_html
@@ -37,7 +36,9 @@ class NamingConvention(Enum):
37
36
  # FUNCTIONS
38
37
 
39
38
 
40
- def get_unidecoded_text(input_text: str, allowed_chars: Set[str], apply_lowercase: bool = False) -> str:
39
+ def get_unidecoded_text(
40
+ input_text: str, allowed_chars: Set[str], apply_lowercase: bool = False
41
+ ) -> str:
41
42
  """
42
43
  Processes a string by unidecoding characters, optionally lowercasing them,
43
44
  while preserving a specified set of allowed characters.
@@ -64,7 +65,9 @@ def get_unidecoded_text(input_text: str, allowed_chars: Set[str], apply_lowercas
64
65
  return "".join(chars_list)
65
66
 
66
67
 
67
- def transliterate_symbols(s: str, allowed_symbols_set: Optional[Set[str]] = None) -> str:
68
+ def transliterate_symbols(
69
+ s: str, allowed_symbols_set: Optional[Set[str]] = None
70
+ ) -> str:
68
71
  """
69
72
  Translates Unicode symbols (category S*) in the input string to their lowercase Unicode names,
70
73
  with spaces replaced by underscores. Other characters, or characters in allowed_symbols_set, remain unchanged.
@@ -179,7 +182,9 @@ def normalize(
179
182
 
180
183
  for c in intermediate_text:
181
184
  cat = unicodedata.category(c)
182
- if c in _allowed_symbols_set or c.isalnum(): # Allowed symbols are part of tokens
185
+ if (
186
+ c in _allowed_symbols_set or c.isalnum()
187
+ ): # Allowed symbols are part of tokens
183
188
  current_token_chars.append(c)
184
189
  elif mode is NormalizationMode.FULL and cat.startswith("S"):
185
190
  # Transliterate S* category symbols not in allowed_symbols
@@ -0,0 +1,69 @@
1
+ ########################################################################################################################
2
+ # IMPORTS
3
+
4
+ import re
5
+ from typing import Literal
6
+ from ...params.nominatim import COUNTRY_PARSING_RULES
7
+
8
+ ########################################################################################################################
9
+ # FUNCTIONS
10
+
11
+
12
def _standardize_es_phone_number(number: str) -> str | None:
    """Validate a Spanish phone number and return its 9-digit national form.

    Args:
        number (str): cleaned, digits-only phone number, optionally prefixed
            with the country calling code (34)

    Returns:
        str | None: the 9-digit national number, or None when `number` does
            not match the Spanish phone pattern
    """
    pattern = COUNTRY_PARSING_RULES["es"]["phone_validate_pattern"]

    match = pattern.match(number)
    if match is None:
        return None

    # The pattern is anchored and always ends with the 9-digit national
    # number, so the tail of the full match is that number with any calling
    # code dropped. Slicing the full match does not depend on how the
    # pattern's capture groups are arranged.
    return match.group(0)[-9:]
29
+
30
+
31
def _standardize_pt_phone_number(number: str) -> str | None:
    """Validate a Portuguese phone number and return its 9-digit national form.

    Args:
        number (str): cleaned, digits-only phone number, optionally prefixed
            with the country calling code (351)

    Returns:
        str | None: the 9-digit national number, or None when `number` does
            not match the Portuguese phone pattern
    """
    pattern = COUNTRY_PARSING_RULES["pt"]["phone_validate_pattern"]

    match = pattern.match(number)
    if match is None:
        return None

    # The pattern is anchored and always ends with the 9-digit national
    # number, so the tail of the full match is that number with any calling
    # code dropped. Slicing the full match does not depend on how the
    # pattern's capture groups are arranged.
    return match.group(0)[-9:]
48
+
49
+
50
+ def parse_phone_number(number: str, country_code: Literal["es", "pt"]) -> str | None:
51
+ """Clean and standardize phone number from a certain country_code
52
+
53
+ Args:
54
+ number (str): phone number
55
+ country_code (Literal["es", "pt"]): country code of the phone number to parse
56
+
57
+ Raises:
58
+ ValueError: when parsing is not supported for a certain country
59
+
60
+ Returns:
61
+ str | None: standardized phone number
62
+ """
63
+ clean_number = re.sub(r"\D", "", number)
64
+ if country_code == "es":
65
+ return _standardize_es_phone_number(clean_number)
66
+ elif country_code == "pt":
67
+ return _standardize_pt_phone_number(clean_number)
68
+ else:
69
+ raise ValueError(f"Country code ({country_code}) is not currently supported")
@@ -1,144 +0,0 @@
1
- CITY_TO_PROVINCE = {"Madrid": "Madrid"}
2
-
3
- POSTCODES = {
4
- "01": "Álava",
5
- "02": "Albacete",
6
- "03": "Alicante",
7
- "04": "Almería",
8
- "05": "Ávila",
9
- "06": "Badajoz",
10
- "07": "Baleares",
11
- "08": "Barcelona",
12
- "09": "Burgos",
13
- "10": "Cáceres",
14
- "11": "Cádiz",
15
- "12": "Castellón",
16
- "13": "Ciudad Real",
17
- "14": "Córdoba",
18
- "15": "La Coruña",
19
- "16": "Cuenca",
20
- "17": "Gerona",
21
- "18": "Granada",
22
- "19": "Guadalajara",
23
- "20": "Guipúzcoa",
24
- "21": "Huelva",
25
- "22": "Huesca",
26
- "23": "Jaén",
27
- "24": "León",
28
- "25": "Lérida",
29
- "26": "La Rioja",
30
- "27": "Lugo",
31
- "28": "Madrid",
32
- "29": "Málaga",
33
- "30": "Murcia",
34
- "31": "Navarra",
35
- "32": "Orense",
36
- "33": "Asturias",
37
- "34": "Palencia",
38
- "35": "Las Palmas",
39
- "36": "Pontevedra",
40
- "37": "Salamanca",
41
- "38": "Santa Cruz de Tenerife",
42
- "39": "Cantabria",
43
- "40": "Segovia",
44
- "41": "Sevilla",
45
- "42": "Soria",
46
- "43": "Tarragona",
47
- "44": "Teruel",
48
- "45": "Toledo",
49
- "46": "Valencia",
50
- "47": "Valladolid",
51
- "48": "Vizcaya",
52
- "49": "Zamora",
53
- "50": "Zaragoza",
54
- "51": "Ceuta",
55
- "52": "Melilla",
56
- }
57
-
58
- # Mapping of normalized names (for comparison) to standardized names (for storing)
59
- # for each corresponding country code
60
- STATES = {
61
- "es": {
62
- "andalucia": "Andalucía",
63
- "aragon": "Aragón",
64
- "asturias": "Asturias",
65
- "baleares": "Baleares",
66
- "canarias": "Canarias",
67
- "cantabria": "Cantabria",
68
- "castilla la mancha": "Castilla-La Mancha",
69
- "castilla y leon": "Castilla y León",
70
- "cataluna": "Cataluña",
71
- "ceuta": "Ceuta",
72
- "comunidad valenciana": "Comunidad Valenciana",
73
- "extremadura": "Extremadura",
74
- "galicia": "Galicia",
75
- "la rioja": "La Rioja",
76
- "madrid": "Comunidad de Madrid",
77
- "melilla": "Melilla",
78
- "murcia": "Murcia",
79
- "navarra": "Navarra",
80
- "pais vasco": "País Vasco",
81
- "euskadi": "País Vasco", # Alias not caught by rapidfuzz
82
- }
83
- }
84
-
85
- PROVINCES = {
86
- "es": {
87
- "alava": "Álava",
88
- "araba": "Álava", # Alias not caught by rapidfuzz
89
- "albacete": "Albacete",
90
- "alicante": "Alicante",
91
- "almeria": "Almería",
92
- "asturias": "Asturias",
93
- "avila": "Ávila",
94
- "badajoz": "Badajoz",
95
- "barcelona": "Barcelona",
96
- "bizkaia": "Vizcaya",
97
- "burgos": "Burgos",
98
- "caceres": "Cáceres",
99
- "cadiz": "Cádiz",
100
- "cantabria": "Cantabria",
101
- "castellon": "Castellón",
102
- "ceuta": "Ceuta", # Considered province by opensm and/or geonames
103
- "ciudad real": "Ciudad Real",
104
- "cordoba": "Córdoba",
105
- "cuenca": "Cuenca",
106
- "gipuzkoa": "Gipuzkoa",
107
- "gerona": "Gerona",
108
- "granada": "Granada",
109
- "guadalajara": "Guadalajara",
110
- "huelva": "Huelva",
111
- "huesca": "Huesca",
112
- "islas baleares": "Islas Baleares",
113
- "jaen": "Jaén",
114
- "la coruna": "La Coruña",
115
- "la rioja": "La Rioja",
116
- "las palmas": "Las Palmas",
117
- "leon": "León",
118
- "lerida": "Lérida",
119
- "lugo": "Lugo",
120
- "madrid": "Madrid",
121
- "malaga": "Málaga",
122
- "melilla": "Melilla", # Considered province by opensm and/or geonames
123
- "murcia": "Murcia",
124
- "navarra": "Navarra",
125
- "orense": "Orense",
126
- "palencia": "Palencia",
127
- "pontevedra": "Pontevedra",
128
- "salamanca": "Salamanca",
129
- "santa cruz de tenerife": "Santa Cruz de Tenerife",
130
- "segovia": "Segovia",
131
- "sevilla": "Sevilla",
132
- "soria": "Soria",
133
- "tarragona": "Tarragona",
134
- "teruel": "Teruel",
135
- "toledo": "Toledo",
136
- "valencia": "Valencia",
137
- "valladolid": "Valladolid",
138
- "zamora": "Zamora",
139
- "zaragoza": "Zaragoza",
140
- }
141
- }
142
-
143
- # Cutoff score for rapidfuzz in the name standardization function
144
- STANDARD_THRESHOLD = 40
@@ -1,38 +0,0 @@
1
- from typing import Optional, Literal
2
- from rapidfuzz import fuzz, process
3
- from ..params.nominatim import STATES, PROVINCES, STANDARD_THRESHOLD
4
- from .strings import normalize
5
-
6
- def standardize_admin_division(
7
- name: str,
8
- level: Literal["province", "state"] = "province",
9
- country_code: str = "es"
10
- ) -> Optional[str]:
11
- """
12
- Normalize and standardize administrative divisions of a given country using RapidFuzz.
13
- Uses normalized dict keys for comparison and returns dict values with the official names.
14
- """
15
- if not name:
16
- return None
17
-
18
- country_code = country_code.lower()
19
- mapping = STATES.get(country_code) if level == "state" else PROVINCES.get(country_code)
20
-
21
- if not mapping: # If country is not standardized, return raw name
22
- return name
23
-
24
- normalized_name = normalize(name) # Essential for rapidfuzz to work well
25
- result = process.extractOne(
26
- normalized_name,
27
- mapping.keys(), # Compare with the normalized names in the dict
28
- scorer=fuzz.WRatio,
29
- score_cutoff=STANDARD_THRESHOLD,
30
- )
31
-
32
- if not result:
33
- return None
34
-
35
- best_key, score, _ = result
36
-
37
- # Return the standardized name corresponding to the normalized name
38
- return mapping[best_key]
File without changes
File without changes