datamarket 0.7.89__py3-none-any.whl → 0.7.125__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,7 @@
1
+ import re
2
+
3
+ from unidecode import unidecode
4
+
1
5
  CITY_TO_PROVINCE = {"Madrid": "Madrid"}
2
6
 
3
7
  POSTCODES = {
@@ -55,24 +59,435 @@ POSTCODES = {
55
59
  "52": "Melilla",
56
60
  }
57
61
 
62
+ # Mapping of normalized names (for comparison) to standardized names (for storing)
63
+ # for each corresponding country code
58
64
  STATES = {
59
- "Andalucía",
60
- "Aragón",
61
- "Asturias",
62
- "Baleares",
63
- "Canarias",
64
- "Cantabria",
65
- "Castilla-La Mancha",
66
- "Castilla y León",
67
- "Cataluña",
68
- "Ceuta",
69
- "Comunidad Valenciana",
70
- "Extremadura",
71
- "Galicia",
72
- "La Rioja",
73
- "Madrid",
74
- "Melilla",
75
- "Murcia",
76
- "Navarra",
77
- "País Vasco",
65
+ "es": {
66
+ "andalucia": "Andalucía",
67
+ "aragon": "Aragón",
68
+ "asturias": "Asturias",
69
+ "baleares": "Baleares",
70
+ "canarias": "Canarias",
71
+ "cantabria": "Cantabria",
72
+ "castilla la mancha": "Castilla-La Mancha",
73
+ "castilla y leon": "Castilla y León",
74
+ "cataluna": "Cataluña",
75
+ "ceuta": "Ceuta",
76
+ "comunidad valenciana": "Comunidad Valenciana",
77
+ "extremadura": "Extremadura",
78
+ "galicia": "Galicia",
79
+ "la rioja": "La Rioja",
80
+ "madrid": "Comunidad de Madrid",
81
+ "melilla": "Melilla",
82
+ "murcia": "Murcia",
83
+ "navarra": "Navarra",
84
+ "pais vasco": "País Vasco",
85
+ "euskadi": "País Vasco", # Alias not caught by rapidfuzz
86
+ }
87
+ }
88
+
89
+ PROVINCES = {
90
+ "es": {
91
+ "alava": "Álava",
92
+ "araba": "Álava", # Alias not caught by rapidfuzz
93
+ "albacete": "Albacete",
94
+ "alicante": "Alicante",
95
+ "almeria": "Almería",
96
+ "asturias": "Asturias",
97
+ "avila": "Ávila",
98
+ "badajoz": "Badajoz",
99
+ "barcelona": "Barcelona",
100
+ "bizkaia": "Vizcaya",
101
+ "burgos": "Burgos",
102
+ "caceres": "Cáceres",
103
+ "cadiz": "Cádiz",
104
+ "cantabria": "Cantabria",
105
+ "castellon": "Castellón",
106
+ "ceuta": "Ceuta", # Considered province by opensm and/or geonames
107
+ "ciudad real": "Ciudad Real",
108
+ "cordoba": "Córdoba",
109
+ "cuenca": "Cuenca",
110
+ "gipuzkoa": "Gipuzkoa",
111
+ "gerona": "Gerona",
112
+ "granada": "Granada",
113
+ "guadalajara": "Guadalajara",
114
+ "huelva": "Huelva",
115
+ "huesca": "Huesca",
116
+ "islas baleares": "Islas Baleares",
117
+ "jaen": "Jaén",
118
+ "la coruna": "La Coruña",
119
+ "la rioja": "La Rioja",
120
+ "las palmas": "Las Palmas",
121
+ "leon": "León",
122
+ "lerida": "Lérida",
123
+ "lugo": "Lugo",
124
+ "madrid": "Madrid",
125
+ "malaga": "Málaga",
126
+ "melilla": "Melilla", # Considered province by opensm and/or geonames
127
+ "murcia": "Murcia",
128
+ "navarra": "Navarra",
129
+ "orense": "Orense",
130
+ "palencia": "Palencia",
131
+ "pontevedra": "Pontevedra",
132
+ "salamanca": "Salamanca",
133
+ "santa cruz de tenerife": "Santa Cruz de Tenerife",
134
+ "segovia": "Segovia",
135
+ "sevilla": "Sevilla",
136
+ "soria": "Soria",
137
+ "tarragona": "Tarragona",
138
+ "teruel": "Teruel",
139
+ "toledo": "Toledo",
140
+ "valencia": "Valencia",
141
+ "valladolid": "Valladolid",
142
+ "zamora": "Zamora",
143
+ "zaragoza": "Zaragoza",
144
+ }
145
+ }
146
+
147
+
148
+ PROVINCE_TO_POSTCODE = {
149
+ "es": {
150
+ "A Coruña": "15",
151
+ "Álava": "01",
152
+ "Araba": "01",
153
+ "Alacant": "03",
154
+ "Alicante": "03",
155
+ "Albacete": "02",
156
+ "Almería": "04",
157
+ "Asturias": "33",
158
+ "Ávila": "05",
159
+ "Badajoz": "06",
160
+ "Baleares": "07",
161
+ "Barcelona": "08",
162
+ "Bizkaia": "48",
163
+ "Burgos": "09",
164
+ "Cáceres": "10",
165
+ "Cádiz": "11",
166
+ "Cantabria": "39",
167
+ "Castelló": "12",
168
+ "Castellón": "12",
169
+ "Ceuta": "51",
170
+ "Ciudad Real": "13",
171
+ "Córdoba": "14",
172
+ "Cuenca": "16",
173
+ "Gerona": "17",
174
+ "Gipuzkoa": "20",
175
+ "Girona": "17",
176
+ "Granada": "18",
177
+ "Guadalajara": "19",
178
+ "Guipúzcoa": "20",
179
+ "Huelva": "21",
180
+ "Huesca": "22",
181
+ "Illes Balears": "07",
182
+ "Jaén": "23",
183
+ "La Coruña": "15",
184
+ "La Rioja": "26",
185
+ "Las Palmas": "35",
186
+ "León": "24",
187
+ "Lérida": "25",
188
+ "Lleida": "25",
189
+ "Lugo": "27",
190
+ "Madrid": "28",
191
+ "Málaga": "29",
192
+ "Melilla": "52",
193
+ "Murcia": "30",
194
+ "Navarra": "31",
195
+ "Orense": "32",
196
+ "Ourense": "32",
197
+ "Palencia": "34",
198
+ "Pontevedra": "36",
199
+ "Salamanca": "37",
200
+ "Santa Cruz de Tenerife": "38",
201
+ "Segovia": "40",
202
+ "Sevilla": "41",
203
+ "Soria": "42",
204
+ "Tarragona": "43",
205
+ "Teruel": "44",
206
+ "Toledo": "45",
207
+ "València": "46",
208
+ "Valencia": "46",
209
+ "Valladolid": "47",
210
+ "Vizcaya": "48",
211
+ "Zamora": "49",
212
+ "Zaragoza": "50",
213
+ },
214
+ "pt": {
215
+ "Aveiro": "3",
216
+ "Beja": "7",
217
+ "Braga": "4",
218
+ "Bragança": "5",
219
+ "Castelo Branco": "6",
220
+ "Coimbra": "3",
221
+ "Évora": "7",
222
+ "Faro": "8",
223
+ "Guarda": "6",
224
+ "Leiria": "2",
225
+ "Lisboa": "1",
226
+ "Portalegre": "7",
227
+ "Porto": "4",
228
+ "Santarém": "2",
229
+ "Setúbal": "2",
230
+ "Viana do Castelo": "4",
231
+ "Vila Real": "5",
232
+ "Viseu": "3",
233
+ "Açores": "9",
234
+ "Madeira": "9",
235
+ },
236
+ }
237
+
238
+
239
+ POSTCODE_TO_STATES = {
240
+ "es": {
241
+ # Andalucía
242
+ "04": "Andalucía",
243
+ "11": "Andalucía",
244
+ "14": "Andalucía",
245
+ "18": "Andalucía",
246
+ "21": "Andalucía",
247
+ "23": "Andalucía",
248
+ "29": "Andalucía",
249
+ "41": "Andalucía",
250
+ # Aragón
251
+ "22": "Aragón",
252
+ "44": "Aragón",
253
+ "50": "Aragón",
254
+ # Asturias
255
+ "33": "Principado de Asturias",
256
+ # Baleares
257
+ "07": "Islas Baleares",
258
+ # Canarias
259
+ "35": "Canarias",
260
+ "38": "Canarias",
261
+ # Cantabria
262
+ "39": "Cantabria",
263
+ # Castilla y León
264
+ "05": "Castilla y León",
265
+ "09": "Castilla y León",
266
+ "24": "Castilla y León",
267
+ "34": "Castilla y León",
268
+ "37": "Castilla y León",
269
+ "40": "Castilla y León",
270
+ "42": "Castilla y León",
271
+ "47": "Castilla y León",
272
+ "49": "Castilla y León",
273
+ # Castilla-La Mancha
274
+ "02": "Castilla-La Mancha",
275
+ "13": "Castilla-La Mancha",
276
+ "16": "Castilla-La Mancha",
277
+ "19": "Castilla-La Mancha",
278
+ "45": "Castilla-La Mancha",
279
+ # Cataluña
280
+ "08": "Cataluña",
281
+ "17": "Cataluña",
282
+ "25": "Cataluña",
283
+ "43": "Cataluña",
284
+ # Comunidad Valenciana
285
+ "03": "Comunidad Valenciana",
286
+ "12": "Comunidad Valenciana",
287
+ "46": "Comunidad Valenciana",
288
+ # Extremadura
289
+ "06": "Extremadura",
290
+ "10": "Extremadura",
291
+ # Galicia
292
+ "15": "Galicia",
293
+ "27": "Galicia",
294
+ "32": "Galicia",
295
+ "36": "Galicia",
296
+ # Madrid
297
+ "28": "Comunidad de Madrid",
298
+ # Murcia
299
+ "30": "Región de Murcia",
300
+ # Navarra
301
+ "31": "Comunidad Foral de Navarra",
302
+ # País Vasco
303
+ "01": "País Vasco",
304
+ "20": "País Vasco",
305
+ "48": "País Vasco",
306
+ # La Rioja
307
+ "26": "La Rioja",
308
+ # Ciudades Autónomas
309
+ "51": "Ceuta",
310
+ "52": "Melilla",
311
+ },
312
+ "pt": { # --- NORTE ---
313
+ "40": "Porto",
314
+ "41": "Porto",
315
+ "42": "Porto",
316
+ "43": "Porto",
317
+ "44": "Porto",
318
+ "45": "Aveiro", # Concelhos do norte de Aveiro, na fronteira com Porto.
319
+ "47": "Braga",
320
+ "48": "Braga", # Guimarães.
321
+ "49": "Viana do Castelo",
322
+ "50": "Vila Real",
323
+ "51": "Vila Real",
324
+ "52": "Vila Real",
325
+ "53": "Vila Real / Bragança", # Zona fronteiriça.
326
+ "54": "Bragança",
327
+ # --- CENTRO ---
328
+ "60": "Castelo Branco",
329
+ "61": "Castelo Branco",
330
+ "62": "Castelo Branco",
331
+ "63": "Guarda",
332
+ "30": "Coimbra",
333
+ "31": "Coimbra",
334
+ "32": "Coimbra",
335
+ "33": "Coimbra",
336
+ "34": "Viseu",
337
+ "35": "Viseu",
338
+ "37": "Aveiro",
339
+ "38": "Aveiro",
340
+ "24": "Leiria",
341
+ # --- ÁREA METROPOLITANA DE LISBOA e arredores ---
342
+ "10": "Lisboa",
343
+ "11": "Lisboa",
344
+ "12": "Lisboa",
345
+ "13": "Lisboa",
346
+ "14": "Lisboa",
347
+ "15": "Lisboa",
348
+ "16": "Lisboa",
349
+ "17": "Lisboa",
350
+ "18": "Lisboa",
351
+ "19": "Lisboa",
352
+ "20": "Santarém",
353
+ "21": "Santarém",
354
+ "22": "Santarém",
355
+ "23": "Santarém", # Tomar e Torres Novas.
356
+ "25": "Lisboa", # Concelhos como Torres Vedras, Mafra, Alenquer.
357
+ "26": "Lisboa", # Concelhos como Loures, Amadora, Odivelas.
358
+ "27": "Lisboa", # Concelhos como Sintra, Cascais, Oeiras.
359
+ "28": "Setúbal",
360
+ "29": "Setúbal",
361
+ # --- ALENTEJO ---
362
+ "70": "Évora",
363
+ "71": "Évora",
364
+ "72": "Évora",
365
+ "73": "Portalegre",
366
+ "74": "Portalegre",
367
+ "75": "Setúbal", # Litoral Alentejano (Sines, Grândola), administrativamente de Setúbal.
368
+ "76": "Beja",
369
+ "77": "Beja",
370
+ "78": "Beja",
371
+ "79": "Beja",
372
+ # --- ALGARVE ---
373
+ "80": "Faro",
374
+ "81": "Faro",
375
+ "82": "Faro",
376
+ "83": "Faro",
377
+ "84": "Faro",
378
+ "85": "Faro",
379
+ "86": "Faro",
380
+ "87": "Faro",
381
+ "88": "Faro",
382
+ "89": "Faro",
383
+ # --- REGIÕES AUTÓNOMAS ---
384
+ "90": "Madeira",
385
+ "91": "Madeira",
386
+ "92": "Madeira",
387
+ "93": "Madeira",
388
+ "95": "Açores", # Ilha de São Miguel (Ponta Delgada).
389
+ "96": "Açores", # Ilha de São Miguel (Ribeira Grande) e Santa Maria.
390
+ "97": "Açores", # Ilha Terceira (Angra do Heroísmo).
391
+ "98": "Açores", # Ilhas de São Jorge, Graciosa, Faial, Pico.
392
+ "99": "Açores", # Ilhas de Flores e Corvo.
393
+ },
394
+ }
395
+
396
+ _NORMALIZED_PROVINCE_CACHE = {}
397
+ for country, provinces in PROVINCE_TO_POSTCODE.items():
398
+ # Get the original keys (e.g., "A Coruña", "Álava")
399
+ original_keys = list(provinces.keys())
400
+
401
+ # Create the normalized list (e.g., "a coruna", "alava")
402
+ normalized_choices = [unidecode(p).lower() for p in original_keys]
403
+
404
+ _NORMALIZED_PROVINCE_CACHE[country] = {
405
+ "choices": normalized_choices, # The list for rapidfuzz to search in
406
+ "keys": original_keys, # The list to find the name by index
407
+ }
408
+
409
+ # Source: https://github.com/ariankoochak/regex-patterns-of-all-countries
410
+ COUNTRY_PARSING_RULES = {
411
+ "es": {
412
+ "zip_validate_pattern": re.compile(r"^\d{5}$"),
413
+ "zip_search_pattern": re.compile(r"\b\d{5}\b"),
414
+ "phone_validate_pattern": re.compile(r"^(\+?34)?[67]\d{8}$"),
415
+ },
416
+ "pt": {
417
+ "zip_validate_pattern": re.compile(r"^\d{4}[- ]{0,1}\d{3}$|^\d{4}$"),
418
+ "zip_search_pattern": re.compile(r"\b\d{4}[- ]?\d{3}\b|\b\d{4}\b"),
419
+ "phone_validate_pattern": re.compile(r"^(\+?351)?9[1236]\d{7}$"),
420
+ },
421
+ }
422
+
423
+ MADRID_DISTRICT_DIRECT_PATCH = {
424
+ # Correcciones directas
425
+ "Aravaca": "Moncloa-Aravaca",
426
+ "Puerta de Hierro": "Fuencarral-El Pardo",
427
+ "Palacio": "Centro",
428
+ "Argüelles": "Moncloa-Aravaca",
429
+ "Barrio de La Estación": "Latina",
430
+ "Casa de Campo": "Moncloa-Aravaca",
431
+ "Universidad": "Centro",
432
+ "Valdezarza": "Moncloa-Aravaca",
433
+ "Cortes": "Centro",
434
+ "Barrio de la Latina": "Centro",
435
+ "Ciudad Universitaria": "Moncloa-Aravaca",
436
+ "Embajadores": "Centro",
437
+ "Justicia": "Centro",
438
+ "Sol": "Centro",
439
+ "Barrio de los Austrias": "Centro",
78
440
  }
441
+
442
+ MADRID_DISTRICT_QUARTER_PATCH = {
443
+ # Reglas dependientes del quarter
444
+ ("Centro", "Atocha"): "Arganzuela",
445
+ ("Centro", "Gaztambide"): "Chamberí",
446
+ ("Centro", "Imperial"): "Arganzuela",
447
+ ("Centro", "Palos de Moguer"): "Arganzuela",
448
+ ("Arganzuela", "Embajadores"): "Centro",
449
+ ("Salamanca", "La Elipa"): "Ciudad Lineal",
450
+ ("Salamanca", "Ventas"): "Ciudad Lineal",
451
+ ("Tetuán", "La Paz"): "Fuencarral-El Pardo",
452
+ ("Tetuán", "San Cristóbal"): "Villaverde",
453
+ ("Tetuán", "Colonia de San Cristóbal"): "Villaverde",
454
+ ("Tetuán", "Valdezarza"): "Moncloa-Aravaca",
455
+ ("Chamberí", "Ciudad Universitaria"): "Moncloa-Aravaca",
456
+ ("Chamberí", "Justicia"): "Centro",
457
+ ("Chamberí", "Universidad"): "Centro",
458
+ ("Fuencarral-El Pardo", "Castilla"): "Chamartín",
459
+ ("Fuencarral-El Pardo", "Valdeacederas"): "Tetuán",
460
+ ("Fuencarral-El Pardo", "Valdezarza"): "Moncloa-Aravaca",
461
+ ("Moncloa-Aravaca", "Bellas Vistas"): "Tetuán",
462
+ ("Moncloa-Aravaca", "Berruguete"): "Tetuán",
463
+ ("Moncloa-Aravaca", "Campamento"): "Latina",
464
+ ("Moncloa-Aravaca", "Gaztambide"): "Chamberí",
465
+ ("Moncloa-Aravaca", "Lucero"): "Latina",
466
+ ("Moncloa-Aravaca", "Valdeacederas"): "Tetuán",
467
+ ("Moncloa-Aravaca", "Vallehermoso"): "Chamberí",
468
+ ("Latina", "Casa de Campo"): "Moncloa-Aravaca",
469
+ ("Villaverde", "San Fermín"): "Usera",
470
+ ("San Blas - Canillejas", "Concepción"): "Ciudad Lineal",
471
+ ("San Blas - Canillejas", "Quintana"): "Ciudad Lineal",
472
+ ("Barajas", "Palomas"): "Hortaleza",
473
+ }
474
+
475
+ MADRID_QUARTER_DIRECT_PATCH = {
476
+ "Barrio de la Latina": "Palacio",
477
+ "Barrio de las Letras": "Cortes",
478
+ "Barrio de los Austrias": "Palacio",
479
+ "Colonia de San Cristóbal": "San Cristóbal",
480
+ "Encinar de los Reyes": "Valdefuentes",
481
+ "La Elipa": "Ventas",
482
+ "Las Cárcavas - San Antonio": "Valdefuentes",
483
+ "Lavapiés": "Embajadores",
484
+ "Montecarmelo": "El Goloso",
485
+ "Puerta de Hierro": "Ciudad Universitaria",
486
+ "Villaverde Alto, Casco Histórico de Villaverde": "San Andrés",
487
+ "Villaverde Bajo": "Los Rosales",
488
+ "Virgen del Cortijo": "Valdefuentes",
489
+ "Las Acacias": "Acacias",
490
+ }
491
+
492
+ # Cutoff score for rapidfuzz in the name standardization function
493
+ STANDARD_THRESHOLD = 40
@@ -3,20 +3,23 @@
3
3
 
4
4
  import re
5
5
  import unicodedata
6
+
6
7
  import inflection
7
8
 
8
9
  ########################################################################################################################
9
10
  # FUNCTIONS
10
11
 
12
+
11
13
  def process_task_name(task_id):
12
- task_id = ''.join(
13
- f"_{unicodedata.name(c)}_" if not c.isalnum() else c for c in task_id
14
- if c.isalnum() or (unicodedata.category(c) not in ('Cc', 'Cf', 'Cs', 'Co', 'Cn'))
14
+ task_id = "".join(
15
+ f"_{unicodedata.name(c)}_" if not c.isalnum() else c
16
+ for c in task_id
17
+ if c.isalnum() or (unicodedata.category(c) not in ("Cc", "Cf", "Cs", "Co", "Cn"))
15
18
  )
16
- task_id = inflection.parameterize(task_id, separator='_')
19
+ task_id = inflection.parameterize(task_id, separator="_")
17
20
  task_id = task_id.lower()
18
- task_id = task_id.strip('_')
19
- task_id = re.sub(r'_+', '_', task_id)
21
+ task_id = task_id.strip("_")
22
+ task_id = re.sub(r"_+", "_", task_id)
20
23
  if task_id[0].isdigit():
21
- task_id = 'task_' + task_id
24
+ task_id = "task_" + task_id
22
25
  return task_id
@@ -8,6 +8,7 @@ from sqlalchemy.ext.declarative import declarative_base
8
8
 
9
9
  Base = declarative_base()
10
10
 
11
+
11
12
  class View(Base):
12
13
  __abstract__ = True
13
14
  is_view = True
@@ -19,4 +20,4 @@ class View(Base):
19
20
  """
20
21
  conn.execute(f"""
21
22
  CREATE OR REPLACE VIEW {cls.__tablename__} AS {query}
22
- """)
23
+ """)