datamarket 0.7.97__py3-none-any.whl → 0.7.98__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datamarket might be problematic. Click here for more details.
- datamarket/interfaces/alchemy.py +4 -1
- datamarket/params/nominatim.py +288 -5
- datamarket/utils/nominatim.py +175 -9
- datamarket/utils/strings/normalization.py +10 -5
- datamarket/utils/strings/standardization.py +69 -0
- {datamarket-0.7.97.dist-info → datamarket-0.7.98.dist-info}/METADATA +1 -1
- {datamarket-0.7.97.dist-info → datamarket-0.7.98.dist-info}/RECORD +9 -8
- {datamarket-0.7.97.dist-info → datamarket-0.7.98.dist-info}/LICENSE +0 -0
- {datamarket-0.7.97.dist-info → datamarket-0.7.98.dist-info}/WHEEL +0 -0
datamarket/interfaces/alchemy.py
CHANGED
|
@@ -215,7 +215,10 @@ class AlchemyInterface:
|
|
|
215
215
|
label = PG_ERROR_LABELS.get(code, "Integrity error (unspecified)")
|
|
216
216
|
|
|
217
217
|
# Log one clean message with trace + the raw DB message separately
|
|
218
|
-
|
|
218
|
+
if code == "23505": # A simple info log for unique violations
|
|
219
|
+
logger.info(f"{label} trying to {action} {alchemy_obj}")
|
|
220
|
+
else:
|
|
221
|
+
logger.error(f"{label} trying to {action} {alchemy_obj}\nPostgreSQL message: {ex.orig}")
|
|
219
222
|
|
|
220
223
|
|
|
221
224
|
def insert_alchemy_obj(self, alchemy_obj: ModelType, silent: bool = False) -> bool:
|
datamarket/params/nominatim.py
CHANGED
|
@@ -1,3 +1,6 @@
|
|
|
1
|
+
from unidecode import unidecode
|
|
2
|
+
import re
|
|
3
|
+
|
|
1
4
|
CITY_TO_PROVINCE = {"Madrid": "Madrid"}
|
|
2
5
|
|
|
3
6
|
POSTCODES = {
|
|
@@ -78,14 +81,14 @@ STATES = {
|
|
|
78
81
|
"murcia": "Murcia",
|
|
79
82
|
"navarra": "Navarra",
|
|
80
83
|
"pais vasco": "País Vasco",
|
|
81
|
-
"euskadi": "País Vasco",
|
|
84
|
+
"euskadi": "País Vasco", # Alias not caught by rapidfuzz
|
|
82
85
|
}
|
|
83
86
|
}
|
|
84
87
|
|
|
85
88
|
PROVINCES = {
|
|
86
89
|
"es": {
|
|
87
90
|
"alava": "Álava",
|
|
88
|
-
"araba": "Álava",
|
|
91
|
+
"araba": "Álava", # Alias not caught by rapidfuzz
|
|
89
92
|
"albacete": "Albacete",
|
|
90
93
|
"alicante": "Alicante",
|
|
91
94
|
"almeria": "Almería",
|
|
@@ -99,7 +102,7 @@ PROVINCES = {
|
|
|
99
102
|
"cadiz": "Cádiz",
|
|
100
103
|
"cantabria": "Cantabria",
|
|
101
104
|
"castellon": "Castellón",
|
|
102
|
-
"ceuta": "Ceuta",
|
|
105
|
+
"ceuta": "Ceuta", # Considered province by opensm and/or geonames
|
|
103
106
|
"ciudad real": "Ciudad Real",
|
|
104
107
|
"cordoba": "Córdoba",
|
|
105
108
|
"cuenca": "Cuenca",
|
|
@@ -119,7 +122,7 @@ PROVINCES = {
|
|
|
119
122
|
"lugo": "Lugo",
|
|
120
123
|
"madrid": "Madrid",
|
|
121
124
|
"malaga": "Málaga",
|
|
122
|
-
"melilla": "Melilla",
|
|
125
|
+
"melilla": "Melilla", # Considered province by opensm and/or geonames
|
|
123
126
|
"murcia": "Murcia",
|
|
124
127
|
"navarra": "Navarra",
|
|
125
128
|
"orense": "Orense",
|
|
@@ -140,5 +143,285 @@ PROVINCES = {
|
|
|
140
143
|
}
|
|
141
144
|
}
|
|
142
145
|
|
|
146
|
+
|
|
147
|
+
# Spanish province names (several spellings per province are listed) mapped to
# the two-digit string their postcodes start with. Insertion order is kept
# as-is because list positions derived from these keys are used for lookups.
_ES_PROVINCE_PREFIXES = {
    "A Coruña": "15",
    "Álava": "01",
    "Araba": "01",
    "Alacant": "03",
    "Alicante": "03",
    "Albacete": "02",
    "Almería": "04",
    "Asturias": "33",
    "Ávila": "05",
    "Badajoz": "06",
    "Baleares": "07",
    "Barcelona": "08",
    "Bizkaia": "48",
    "Burgos": "09",
    "Cáceres": "10",
    "Cádiz": "11",
    "Cantabria": "39",
    "Castelló": "12",
    "Castellón": "12",
    "Ceuta": "51",
    "Ciudad Real": "13",
    "Córdoba": "14",
    "Cuenca": "16",
    "Gerona": "17",
    "Gipuzkoa": "20",
    "Girona": "17",
    "Granada": "18",
    "Guadalajara": "19",
    "Guipúzcoa": "20",
    "Huelva": "21",
    "Huesca": "22",
    "Illes Balears": "07",
    "Jaén": "23",
    "La Coruña": "15",
    "La Rioja": "26",
    "Las Palmas": "35",
    "León": "24",
    "Lérida": "25",
    "Lleida": "25",
    "Lugo": "27",
    "Madrid": "28",
    "Málaga": "29",
    "Melilla": "52",
    "Murcia": "30",
    "Navarra": "31",
    "Orense": "32",
    "Ourense": "32",
    "Palencia": "34",
    "Pontevedra": "36",
    "Salamanca": "37",
    "Santa Cruz de Tenerife": "38",
    "Segovia": "40",
    "Sevilla": "41",
    "Soria": "42",
    "Tarragona": "43",
    "Teruel": "44",
    "Toledo": "45",
    "València": "46",
    "Valencia": "46",
    "Valladolid": "47",
    "Vizcaya": "48",
    "Zamora": "49",
    "Zaragoza": "50",
}

# Portuguese names mapped to a one-character string
# (presumably the leading postcode digit -- TODO confirm).
_PT_DISTRICT_PREFIXES = {
    "Aveiro": "3",
    "Beja": "7",
    "Braga": "4",
    "Bragança": "5",
    "Castelo Branco": "6",
    "Coimbra": "3",
    "Évora": "7",
    "Faro": "8",
    "Guarda": "6",
    "Leiria": "2",
    "Lisboa": "1",
    "Portalegre": "7",
    "Porto": "4",
    "Santarém": "2",
    "Setúbal": "2",
    "Viana do Castelo": "4",
    "Vila Real": "5",
    "Viseu": "3",
    "Açores": "9",
    "Madeira": "9",
}

# Country code -> {province name -> postcode prefix}
PROVINCE_TO_POSTCODE = {
    "es": _ES_PROVINCE_PREFIXES,
    "pt": _PT_DISTRICT_PREFIXES,
}
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
def _expand_prefix_groups(groups):
    """Flatten ``(name, prefixes)`` pairs into ``{prefix: name}``, keeping the listed order."""
    return {prefix: name for name, prefixes in groups for prefix in prefixes}


# Spain: two-digit postcode prefix -> autonomous community (or autonomous city).
_ES_STATE_PREFIX_GROUPS = (
    ("Andalucía", ("04", "11", "14", "18", "21", "23", "29", "41")),
    ("Aragón", ("22", "44", "50")),
    ("Principado de Asturias", ("33",)),
    ("Islas Baleares", ("07",)),
    ("Canarias", ("35", "38")),
    ("Cantabria", ("39",)),
    ("Castilla y León", ("05", "09", "24", "34", "37", "40", "42", "47", "49")),
    ("Castilla-La Mancha", ("02", "13", "16", "19", "45")),
    ("Cataluña", ("08", "17", "25", "43")),
    ("Comunidad Valenciana", ("03", "12", "46")),
    ("Extremadura", ("06", "10")),
    ("Galicia", ("15", "27", "32", "36")),
    ("Comunidad de Madrid", ("28",)),
    ("Región de Murcia", ("30",)),
    ("Comunidad Foral de Navarra", ("31",)),
    ("País Vasco", ("01", "20", "48")),
    ("La Rioja", ("26",)),
    # Autonomous cities
    ("Ceuta", ("51",)),
    ("Melilla", ("52",)),
)

# Portugal: two-digit postcode prefix -> name. A name may appear in several
# groups because its prefixes are not contiguous.
_PT_STATE_PREFIX_GROUPS = (
    # --- North ---
    ("Porto", ("40", "41", "42", "43", "44")),
    ("Aveiro", ("45",)),  # Northern Aveiro municipalities, bordering Porto
    (
        "Braga",
        (
            "47",
            "48",  # Guimarães
        ),
    ),
    ("Viana do Castelo", ("49",)),
    ("Vila Real", ("50", "51", "52")),
    ("Vila Real / Bragança", ("53",)),  # Border area
    ("Bragança", ("54",)),
    # --- Centre ---
    ("Castelo Branco", ("60", "61", "62")),
    ("Guarda", ("63",)),
    ("Coimbra", ("30", "31", "32", "33")),
    ("Viseu", ("34", "35")),
    ("Aveiro", ("37", "38")),
    ("Leiria", ("24",)),
    # --- Lisbon metropolitan area and surroundings ---
    ("Lisboa", ("10", "11", "12", "13", "14", "15", "16", "17", "18", "19")),
    (
        "Santarém",
        (
            "20",
            "21",
            "22",
            "23",  # Tomar and Torres Novas
        ),
    ),
    (
        "Lisboa",
        (
            "25",  # Municipalities such as Torres Vedras, Mafra, Alenquer
            "26",  # Municipalities such as Loures, Amadora, Odivelas
            "27",  # Municipalities such as Sintra, Cascais, Oeiras
        ),
    ),
    ("Setúbal", ("28", "29")),
    # --- Alentejo ---
    ("Évora", ("70", "71", "72")),
    ("Portalegre", ("73", "74")),
    ("Setúbal", ("75",)),  # Alentejo coast (Sines, Grândola), administratively Setúbal
    ("Beja", ("76", "77", "78", "79")),
    # --- Algarve ---
    ("Faro", ("80", "81", "82", "83", "84", "85", "86", "87", "88", "89")),
    # --- Autonomous regions ---
    ("Madeira", ("90", "91", "92", "93")),
    (
        "Açores",
        (
            "95",  # São Miguel island (Ponta Delgada)
            "96",  # São Miguel island (Ribeira Grande) and Santa Maria
            "97",  # Terceira island (Angra do Heroísmo)
            "98",  # São Jorge, Graciosa, Faial and Pico islands
            "99",  # Flores and Corvo islands
        ),
    ),
)

# Country code -> {two-digit postcode prefix -> state name}
POSTCODE_TO_STATES = {
    "es": _expand_prefix_groups(_ES_STATE_PREFIX_GROUPS),
    "pt": _expand_prefix_groups(_PT_STATE_PREFIX_GROUPS),
}
|
|
394
|
+
|
|
395
|
+
# Pre-computed lookup data for province matching, keyed by country code.
# Built with a comprehension so that no loop temporaries are left behind as
# importable module attributes.
_NORMALIZED_PROVINCE_CACHE = {
    country_code: {
        # Accent-stripped, lowercased names (e.g. "a coruna", "alava"):
        # the list rapidfuzz searches in
        "choices": [unidecode(name).lower() for name in province_prefixes],
        # Original spellings (e.g. "A Coruña", "Álava") in the same order,
        # so a match index can be mapped back to the original key
        "keys": list(province_prefixes),
    }
    for country_code, province_prefixes in PROVINCE_TO_POSTCODE.items()
}
|
|
407
|
+
|
|
408
|
+
# Source: https://github.com/ariankoochak/regex-patterns-of-all-countries
#
# Per-country compiled patterns:
#   zip_validate_pattern   -- matches a string that is, as a whole, a zip code
#   zip_search_pattern     -- finds a zip code embedded in free text
#   phone_validate_pattern -- matches a phone number with an optional country
#                             prefix; group(1) is the national number without it
COUNTRY_PARSING_RULES = {
    "es": {
        "zip_validate_pattern": re.compile(r"^\d{5}$"),
        "zip_search_pattern": re.compile(r"\b\d{5}\b"),
        # The prefix is non-capturing so that group(1) is the 9-digit number.
        # The class is [67]; "[6|7]" would also accept a literal "|".
        "phone_validate_pattern": re.compile(r"^(?:\+?34)?([67]\d{8})$"),
    },
    "pt": {
        "zip_validate_pattern": re.compile(r"^\d{4}[- ]{0,1}\d{3}$|^\d{4}$"),
        "zip_search_pattern": re.compile(r"\b\d{4}[- ]?\d{3}\b|\b\d{4}\b"),
        # Same layout as the Spanish pattern: group(1) is the 9-digit number
        "phone_validate_pattern": re.compile(r"^(?:\+?351)?(9[1236]\d{7})$"),
    },
}
|
|
425
|
+
|
|
143
426
|
# Cutoff score for rapidfuzz in the name standardization function
|
|
144
|
-
STANDARD_THRESHOLD = 40
|
|
427
|
+
STANDARD_THRESHOLD = 40
|
datamarket/utils/nominatim.py
CHANGED
|
@@ -1,12 +1,28 @@
|
|
|
1
|
-
|
|
1
|
+
########################################################################################################################
|
|
2
|
+
# IMPORTS
|
|
3
|
+
|
|
4
|
+
from typing import Literal, Optional
|
|
2
5
|
from rapidfuzz import fuzz, process
|
|
3
|
-
from
|
|
6
|
+
from unidecode import unidecode
|
|
7
|
+
from ..params.nominatim import (
|
|
8
|
+
POSTCODE_TO_STATES,
|
|
9
|
+
PROVINCE_TO_POSTCODE,
|
|
10
|
+
PROVINCES,
|
|
11
|
+
STANDARD_THRESHOLD,
|
|
12
|
+
STATES,
|
|
13
|
+
_NORMALIZED_PROVINCE_CACHE,
|
|
14
|
+
COUNTRY_PARSING_RULES
|
|
15
|
+
)
|
|
4
16
|
from .strings import normalize
|
|
5
17
|
|
|
18
|
+
########################################################################################################################
|
|
19
|
+
# FUNCTIONS
|
|
20
|
+
|
|
21
|
+
|
|
6
22
|
def standardize_admin_division(
|
|
7
23
|
name: str,
|
|
8
24
|
level: Literal["province", "state"] = "province",
|
|
9
|
-
country_code: str = "es"
|
|
25
|
+
country_code: str = "es",
|
|
10
26
|
) -> Optional[str]:
|
|
11
27
|
"""
|
|
12
28
|
Normalize and standardize administrative divisions of a given country using RapidFuzz.
|
|
@@ -16,15 +32,17 @@ def standardize_admin_division(
|
|
|
16
32
|
return None
|
|
17
33
|
|
|
18
34
|
country_code = country_code.lower()
|
|
19
|
-
mapping =
|
|
35
|
+
mapping = (
|
|
36
|
+
STATES.get(country_code) if level == "state" else PROVINCES.get(country_code)
|
|
37
|
+
)
|
|
20
38
|
|
|
21
|
-
if not mapping:
|
|
39
|
+
if not mapping: # If country is not standardized, return raw name
|
|
22
40
|
return name
|
|
23
41
|
|
|
24
|
-
normalized_name = normalize(name)
|
|
42
|
+
normalized_name = normalize(name) # Essential for rapidfuzz to work well
|
|
25
43
|
result = process.extractOne(
|
|
26
44
|
normalized_name,
|
|
27
|
-
mapping.keys(),
|
|
45
|
+
mapping.keys(), # Compare with the normalized names in the dict
|
|
28
46
|
scorer=fuzz.WRatio,
|
|
29
47
|
score_cutoff=STANDARD_THRESHOLD,
|
|
30
48
|
)
|
|
@@ -33,6 +51,154 @@ def standardize_admin_division(
|
|
|
33
51
|
return None
|
|
34
52
|
|
|
35
53
|
best_key, score, _ = result
|
|
36
|
-
|
|
54
|
+
|
|
37
55
|
# Return the standardized name corresponding to the normalized name
|
|
38
|
-
return mapping[best_key]
|
|
56
|
+
return mapping[best_key]
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def parse_state(
    zip_code: str,
    country_code: str,
) -> str | None:
    """Look up the state a zip code belongs to, using its first two characters.

    Args:
        zip_code (str): zip code; only ``zip_code[:2]`` is used for the lookup
        country_code (str): key into ``POSTCODE_TO_STATES``, used as given

    Returns:
        str | None: the mapped state, or None when the country or the prefix is not in the table
    """
    # An unknown country falls back to an empty table, so the result is None
    return POSTCODE_TO_STATES.get(country_code, {}).get(zip_code[:2])
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def _province_postcode_match(
    address: str,
    zip_code: str,
    country_code: str,
) -> str | None:
    """
    Rebuild a zip code from the province name found in an address.

    The address is transliterated with ``unidecode`` and lowercased, then compared against
    the pre-computed normalized province names of the country with ``fuzz.partial_ratio``.
    Only a score of 100 is accepted (intended as an exact substring match).

    Args:
        address (str): free-text address that may mention a province
        zip_code (str): zip code to repair
        country_code (str): key into the province cache and ``PROVINCE_TO_POSTCODE``

    Returns:
        str | None: None when the country is not configured or no province matches.
        Otherwise, when ``zip_code`` has exactly four characters, the province prefix
        followed by ``zip_code[1:]``; for any other length, ``zip_code`` unchanged.
    """
    province_index = _NORMALIZED_PROVINCE_CACHE.get(country_code)
    if not province_index:
        return None  # Country not configured

    hit = process.extractOne(
        unidecode(address).lower(),
        province_index["choices"],
        scorer=fuzz.partial_ratio,
        score_cutoff=100,
    )
    if not hit:
        return None  # No province reached the cutoff

    # Third element of the result is the position in the choices list, which
    # lines up with the original (non-normalized) names in "keys"
    matched_name = province_index["keys"][hit[2]]
    prefix = PROVINCE_TO_POSTCODE.get(country_code, {})[matched_name]

    if len(zip_code) == 4:
        return prefix + zip_code[1:]
    return zip_code
|
|
127
|
+
|
|
128
|
+
def _parse_es_zip_code(
    zip_code: str,
    address: str,
    opt_address: str | None,
) -> str:
    """Parse a Spanish zip code.

    Order of attempts: the given ``zip_code`` if it validates; the first zip code found
    in ``address``; the first one found in ``opt_address`` (when provided); the result of
    ``_province_postcode_match``; finally the given ``zip_code`` unchanged.
    """
    rules = COUNTRY_PARSING_RULES['es']
    if rules['zip_validate_pattern'].match(zip_code):
        return zip_code

    finder = rules['zip_search_pattern']
    found = finder.search(address)
    if found is None and opt_address:
        found = finder.search(opt_address)
    if found is not None:
        return found.group()

    # Last resort: derive the code from a province name mentioned in the address
    return _province_postcode_match(address, zip_code, country_code="es") or zip_code
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def _parse_pt_zip_code(
    zip_code: str,
    address: str,
    opt_address: str | None,
) -> str:
    """Parse a Portuguese zip code.

    Returns the given ``zip_code`` if it validates; otherwise the first zip code found in
    ``address``, then in ``opt_address`` (when provided); otherwise ``zip_code`` unchanged.
    """
    rules = COUNTRY_PARSING_RULES['pt']
    if rules['zip_validate_pattern'].match(zip_code):
        return zip_code

    search = rules['zip_search_pattern'].search
    hit = search(address)
    if hit is None and opt_address:
        hit = search(opt_address)

    if hit is None:
        return zip_code
    return hit.group()
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def parse_zip_code(
|
|
180
|
+
address: str,
|
|
181
|
+
zip_code: str,
|
|
182
|
+
country_code: str,
|
|
183
|
+
opt_address: str | None = None,
|
|
184
|
+
) -> str | None:
|
|
185
|
+
"""Parse and standardize zip code
|
|
186
|
+
|
|
187
|
+
Args:
|
|
188
|
+
address (str): written address
|
|
189
|
+
zip_code (str)
|
|
190
|
+
country_code (str):
|
|
191
|
+
opt_address (str | None, optional): optional extra address, usually None. Defaults to None.
|
|
192
|
+
|
|
193
|
+
Raises:
|
|
194
|
+
ValueError: when parsing zip code is not supported for the passed country_code
|
|
195
|
+
|
|
196
|
+
Returns:
|
|
197
|
+
str | None
|
|
198
|
+
"""
|
|
199
|
+
if country_code == "es":
|
|
200
|
+
return _parse_es_zip_code(zip_code, address, opt_address)
|
|
201
|
+
elif country_code == "pt":
|
|
202
|
+
return _parse_pt_zip_code(zip_code, address, opt_address)
|
|
203
|
+
else:
|
|
204
|
+
raise ValueError(f"Country code ({country_code}) is not currently supported")
|
|
@@ -1,10 +1,9 @@
|
|
|
1
1
|
########################################################################################################################
|
|
2
2
|
# IMPORTS
|
|
3
|
-
|
|
3
|
+
import re
|
|
4
4
|
import unicodedata
|
|
5
5
|
from enum import Enum, auto
|
|
6
6
|
from typing import Any, Optional, Set, Union
|
|
7
|
-
|
|
8
7
|
import numpy as np
|
|
9
8
|
from inflection import camelize, parameterize, titleize, underscore
|
|
10
9
|
from string_utils import prettify, strip_html
|
|
@@ -37,7 +36,9 @@ class NamingConvention(Enum):
|
|
|
37
36
|
# FUNCTIONS
|
|
38
37
|
|
|
39
38
|
|
|
40
|
-
def get_unidecoded_text(
|
|
39
|
+
def get_unidecoded_text(
|
|
40
|
+
input_text: str, allowed_chars: Set[str], apply_lowercase: bool = False
|
|
41
|
+
) -> str:
|
|
41
42
|
"""
|
|
42
43
|
Processes a string by unidecoding characters, optionally lowercasing them,
|
|
43
44
|
while preserving a specified set of allowed characters.
|
|
@@ -64,7 +65,9 @@ def get_unidecoded_text(input_text: str, allowed_chars: Set[str], apply_lowercas
|
|
|
64
65
|
return "".join(chars_list)
|
|
65
66
|
|
|
66
67
|
|
|
67
|
-
def transliterate_symbols(
|
|
68
|
+
def transliterate_symbols(
|
|
69
|
+
s: str, allowed_symbols_set: Optional[Set[str]] = None
|
|
70
|
+
) -> str:
|
|
68
71
|
"""
|
|
69
72
|
Translates Unicode symbols (category S*) in the input string to their lowercase Unicode names,
|
|
70
73
|
with spaces replaced by underscores. Other characters, or characters in allowed_symbols_set, remain unchanged.
|
|
@@ -179,7 +182,9 @@ def normalize(
|
|
|
179
182
|
|
|
180
183
|
for c in intermediate_text:
|
|
181
184
|
cat = unicodedata.category(c)
|
|
182
|
-
if
|
|
185
|
+
if (
|
|
186
|
+
c in _allowed_symbols_set or c.isalnum()
|
|
187
|
+
): # Allowed symbols are part of tokens
|
|
183
188
|
current_token_chars.append(c)
|
|
184
189
|
elif mode is NormalizationMode.FULL and cat.startswith("S"):
|
|
185
190
|
# Transliterate S* category symbols not in allowed_symbols
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
########################################################################################################################
|
|
2
|
+
# IMPORTS
|
|
3
|
+
|
|
4
|
+
import re
|
|
5
|
+
from typing import Literal
|
|
6
|
+
from ...params.nominatim import COUNTRY_PARSING_RULES
|
|
7
|
+
|
|
8
|
+
########################################################################################################################
|
|
9
|
+
# FUNCTIONS
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _standardize_es_phone_number(number: str) -> str | None:
    """Standardize phone numbers from Spain using regex validation.

    Args:
        number (str): cleaned, digits-only phone number, with or without the 34 prefix

    Returns:
        str | None: the 9-digit national number (country prefix removed) when
        ``number`` matches the Spanish validation pattern, else None
    """
    # Get the validation regex from params
    pattern = COUNTRY_PARSING_RULES["es"]["phone_validate_pattern"]

    match = pattern.match(number)
    if match is None:
        return None

    # The pattern is an optional country prefix followed by exactly nine
    # characters, so the national number is the tail of the full match.
    # group(1) is not used: it may be the optional prefix group, which is
    # "34" or None rather than the number itself.
    return match.group()[-9:]
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _standardize_pt_phone_number(number: str) -> str | None:
    """Standardize phone numbers from Portugal using regex validation.

    Args:
        number (str): cleaned, digits-only phone number, with or without the 351 prefix

    Returns:
        str | None: the 9-digit national number (country prefix removed) when
        ``number`` matches the Portuguese validation pattern, else None
    """
    # Get the validation regex from params
    pattern = COUNTRY_PARSING_RULES["pt"]["phone_validate_pattern"]

    match = pattern.match(number)
    if match is None:
        return None

    # The pattern is an optional country prefix followed by exactly nine
    # digits, so the national number is the tail of the full match.
    # group(1) is not used: it may be the optional prefix group, which is
    # "351" or None rather than the number itself.
    return match.group()[-9:]
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def parse_phone_number(number: str, country_code: Literal["es", "pt"]) -> str | None:
|
|
51
|
+
"""Clean and standardize phone number from a certain country_code
|
|
52
|
+
|
|
53
|
+
Args:
|
|
54
|
+
number (str): phone number
|
|
55
|
+
country_code (Literal["es", "pt"]): country code of the phone number to parse
|
|
56
|
+
|
|
57
|
+
Raises:
|
|
58
|
+
ValueError: when parsing is not supported for a certain country
|
|
59
|
+
|
|
60
|
+
Returns:
|
|
61
|
+
str | None: standardized phone number
|
|
62
|
+
"""
|
|
63
|
+
clean_number = re.sub(r"\D", "", number)
|
|
64
|
+
if country_code == "es":
|
|
65
|
+
return _standardize_es_phone_number(clean_number)
|
|
66
|
+
elif country_code == "pt":
|
|
67
|
+
return _standardize_pt_phone_number(clean_number)
|
|
68
|
+
else:
|
|
69
|
+
raise ValueError(f"Country code ({country_code}) is not currently supported")
|
|
@@ -2,7 +2,7 @@ datamarket/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
|
2
2
|
datamarket/exceptions/__init__.py,sha256=-Vu-RZNKjW6fYCLqbUJTkKNuHeA8Yi_gyR50oZNaA_8,33
|
|
3
3
|
datamarket/exceptions/main.py,sha256=MP5ql6M7DoMbBf-Dg_2ohcUFdWXgzv-dXHntPPit31s,453
|
|
4
4
|
datamarket/interfaces/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
5
|
-
datamarket/interfaces/alchemy.py,sha256=
|
|
5
|
+
datamarket/interfaces/alchemy.py,sha256=i2lKLLLy3-jpbzV3-jxfRCXTy7jRoTsNU3063pmSonk,15749
|
|
6
6
|
datamarket/interfaces/aws.py,sha256=co5JkC3iFIp-0FqdYX4eKy3_m71LhZKuJoW6kXwEImc,4780
|
|
7
7
|
datamarket/interfaces/azure.py,sha256=PnPlo95skYiq63qYa4QDvEnVYi2JblPmMSfbTsmXhFs,4937
|
|
8
8
|
datamarket/interfaces/drive.py,sha256=3nhx3THr2SHNWKYwme9F2nPpvsqyEMFIxz0whF2FjHk,4840
|
|
@@ -12,23 +12,24 @@ datamarket/interfaces/peerdb.py,sha256=sO451wEGNb_0DDwchZ6eBVYKltqHM5XKau-WsfspX
|
|
|
12
12
|
datamarket/interfaces/proxy.py,sha256=Uu-dHvpQOLNBZPGHAanLXnKT1789ArcHfOw8exECt34,5398
|
|
13
13
|
datamarket/interfaces/tinybird.py,sha256=AYrcRGNOCoCt7ojilkWa27POROee9sTCwZ61GGHEPeM,2698
|
|
14
14
|
datamarket/params/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
15
|
-
datamarket/params/nominatim.py,sha256=
|
|
15
|
+
datamarket/params/nominatim.py,sha256=Xl0mBls_Yz7y8nU6-boNTMDYsxvJRIFpZyMT2gLOqvs,11968
|
|
16
16
|
datamarket/utils/__init__.py,sha256=FHLh-Qp9XpM4LkAocppCf_llW2CWVVghGorkqxqt1wk,34
|
|
17
17
|
datamarket/utils/airflow.py,sha256=al0vc0YUikNu3Oy51VSn52I7pMU40akFBOl_UlHa2E4,795
|
|
18
18
|
datamarket/utils/alchemy.py,sha256=SRq6kgh1aANXVShBPgAuglmNhZssPWwWEY503gKSia8,635
|
|
19
19
|
datamarket/utils/main.py,sha256=KYHjDOps6_Q3TFV_Jj7MLj-L9Evx05AXELCvp06BARU,5857
|
|
20
|
-
datamarket/utils/nominatim.py,sha256=
|
|
20
|
+
datamarket/utils/nominatim.py,sha256=IxexKY2KOlDhiKtzsqQfoVUjJXPxJl7tn3iHUaQKg08,5795
|
|
21
21
|
datamarket/utils/playwright/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
22
22
|
datamarket/utils/playwright/async_api.py,sha256=UbA2D4ScBtYeMfrRjly4RO-s8wXIub9c05J1eoOCpsQ,5782
|
|
23
23
|
datamarket/utils/playwright/sync_api.py,sha256=Tw_-KLB3vipFuEQwcX8iCbj7giCzcwXB-bhl_ncR-2Q,5542
|
|
24
24
|
datamarket/utils/selenium.py,sha256=IMKlbLzXABFhACnWzhHmB0l2hhVzNwHGZwbo14nEewQ,2499
|
|
25
25
|
datamarket/utils/soda.py,sha256=eZTXFbI1P3WoMd1MM-YjoVTpdjTcDSWuvBb7ViBMhSQ,941
|
|
26
26
|
datamarket/utils/strings/__init__.py,sha256=b6TYOT9v7y9ID-lDyZk4E8BH2uIPbsF2ZSLGjCQ1MCQ,43
|
|
27
|
-
datamarket/utils/strings/normalization.py,sha256=
|
|
27
|
+
datamarket/utils/strings/normalization.py,sha256=UBluU6ABY6aCpnd02F7L7HcivVSisRJ9IUXdj9D1MyE,9050
|
|
28
28
|
datamarket/utils/strings/obfuscation.py,sha256=Jo-x3f2Cb75983smmpcdPqUlBrLCTyrnmH3FPlgUUjM,5246
|
|
29
|
+
datamarket/utils/strings/standardization.py,sha256=xl4I6F3brDFdRWKy7jKnOIIEo2YsqcBcPa2p5TJGRC0,2236
|
|
29
30
|
datamarket/utils/typer.py,sha256=FDF3l6gh3UlAFPsHCtesnekvct2rKz0oFn3uKARBQvE,814
|
|
30
31
|
datamarket/utils/types.py,sha256=vxdQZdwdXrfPR4Es52gBgol-tMRIOD6oK9cBo3rB0JQ,74
|
|
31
|
-
datamarket-0.7.
|
|
32
|
-
datamarket-0.7.
|
|
33
|
-
datamarket-0.7.
|
|
34
|
-
datamarket-0.7.
|
|
32
|
+
datamarket-0.7.98.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
|
33
|
+
datamarket-0.7.98.dist-info/METADATA,sha256=f8YsBdGrtjmUOroA-qvv1bEpnt1LSfrCxqt8z6kF_tk,7381
|
|
34
|
+
datamarket-0.7.98.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
|
|
35
|
+
datamarket-0.7.98.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|