datamarket 0.6.0__py3-none-any.whl → 0.10.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datamarket might be problematic. Click here for more details.
- datamarket/__init__.py +0 -1
- datamarket/exceptions/__init__.py +1 -0
- datamarket/exceptions/main.py +118 -0
- datamarket/interfaces/alchemy.py +1934 -25
- datamarket/interfaces/aws.py +81 -14
- datamarket/interfaces/azure.py +127 -0
- datamarket/interfaces/drive.py +60 -10
- datamarket/interfaces/ftp.py +37 -14
- datamarket/interfaces/llm.py +1220 -0
- datamarket/interfaces/nominatim.py +314 -42
- datamarket/interfaces/peerdb.py +272 -104
- datamarket/interfaces/proxy.py +354 -50
- datamarket/interfaces/tinybird.py +7 -15
- datamarket/params/nominatim.py +439 -0
- datamarket/utils/__init__.py +1 -1
- datamarket/utils/airflow.py +10 -7
- datamarket/utils/alchemy.py +2 -1
- datamarket/utils/logs.py +88 -0
- datamarket/utils/main.py +138 -10
- datamarket/utils/nominatim.py +201 -0
- datamarket/utils/playwright/__init__.py +0 -0
- datamarket/utils/playwright/async_api.py +274 -0
- datamarket/utils/playwright/sync_api.py +281 -0
- datamarket/utils/requests.py +655 -0
- datamarket/utils/selenium.py +6 -12
- datamarket/utils/strings/__init__.py +1 -0
- datamarket/utils/strings/normalization.py +217 -0
- datamarket/utils/strings/obfuscation.py +153 -0
- datamarket/utils/strings/standardization.py +40 -0
- datamarket/utils/typer.py +2 -1
- datamarket/utils/types.py +1 -0
- datamarket-0.10.3.dist-info/METADATA +172 -0
- datamarket-0.10.3.dist-info/RECORD +38 -0
- {datamarket-0.6.0.dist-info → datamarket-0.10.3.dist-info}/WHEEL +1 -2
- datamarket-0.6.0.dist-info/METADATA +0 -49
- datamarket-0.6.0.dist-info/RECORD +0 -24
- datamarket-0.6.0.dist-info/top_level.txt +0 -1
- {datamarket-0.6.0.dist-info → datamarket-0.10.3.dist-info/licenses}/LICENSE +0 -0
datamarket/params/nominatim.py
CHANGED
|
@@ -1,3 +1,9 @@
|
|
|
1
|
+
import re
|
|
2
|
+
|
|
3
|
+
from unidecode import unidecode
|
|
4
|
+
|
|
5
|
+
# Maps a city name to the province name it belongs to.
# NOTE(review): only Madrid is listed; presumably extended as needed — confirm against callers.
CITY_TO_PROVINCE = {"Madrid": "Madrid"}
|
|
6
|
+
|
|
1
7
|
POSTCODES = {
|
|
2
8
|
"01": "Álava",
|
|
3
9
|
"02": "Albacete",
|
|
@@ -52,3 +58,436 @@ POSTCODES = {
|
|
|
52
58
|
"51": "Ceuta",
|
|
53
59
|
"52": "Melilla",
|
|
54
60
|
}
|
|
61
|
+
|
|
62
|
+
# Mapping of normalized names (for comparison) to standardized names (for storing)
# for each corresponding country code.
# Outer keys are lowercase country codes; inner keys are lowercase, accent-free
# names and inner values are the display names with their original accents.
STATES = {
    "es": {
        "andalucia": "Andalucía",
        "aragon": "Aragón",
        "asturias": "Asturias",
        "baleares": "Baleares",
        "canarias": "Canarias",
        "cantabria": "Cantabria",
        "castilla la mancha": "Castilla-La Mancha",
        "castilla y leon": "Castilla y León",
        "cataluna": "Cataluña",
        "ceuta": "Ceuta",
        "comunidad valenciana": "Comunidad Valenciana",
        "extremadura": "Extremadura",
        "galicia": "Galicia",
        "la rioja": "La Rioja",
        "madrid": "Comunidad de Madrid",
        "melilla": "Melilla",
        "murcia": "Murcia",
        "navarra": "Navarra",
        "pais vasco": "País Vasco",
        "euskadi": "País Vasco",  # Alias not caught by rapidfuzz
    }
}
|
|
88
|
+
|
|
89
|
+
# Province lookup table, keyed by lowercase country code.
# Inner keys are lowercase, accent-free province names; inner values are the
# standardized display names.
PROVINCES = {
    "es": {
        "alava": "Álava",
        "araba": "Álava",  # Alias not caught by rapidfuzz
        "albacete": "Albacete",
        "alicante": "Alicante",
        "almeria": "Almería",
        "asturias": "Asturias",
        "avila": "Ávila",
        "badajoz": "Badajoz",
        "barcelona": "Barcelona",
        "bizkaia": "Vizcaya",
        "burgos": "Burgos",
        "caceres": "Cáceres",
        "cadiz": "Cádiz",
        "cantabria": "Cantabria",
        "castellon": "Castellón",
        "ceuta": "Ceuta",  # Considered province by opensm and/or geonames
        "ciudad real": "Ciudad Real",
        "cordoba": "Córdoba",
        "cuenca": "Cuenca",
        # NOTE(review): stored under its Basque spelling, whereas "bizkaia" above
        # is stored as the Spanish "Vizcaya" — confirm this is intended.
        "gipuzkoa": "Gipuzkoa",
        "gerona": "Gerona",
        "granada": "Granada",
        "guadalajara": "Guadalajara",
        "huelva": "Huelva",
        "huesca": "Huesca",
        "islas baleares": "Islas Baleares",
        "jaen": "Jaén",
        "la coruna": "La Coruña",
        "la rioja": "La Rioja",
        "las palmas": "Las Palmas",
        "leon": "León",
        "lerida": "Lérida",
        "lugo": "Lugo",
        "madrid": "Madrid",
        "malaga": "Málaga",
        "melilla": "Melilla",  # Considered province by opensm and/or geonames
        "murcia": "Murcia",
        "navarra": "Navarra",
        "orense": "Orense",
        "palencia": "Palencia",
        "pontevedra": "Pontevedra",
        "salamanca": "Salamanca",
        "santa cruz de tenerife": "Santa Cruz de Tenerife",
        "segovia": "Segovia",
        "sevilla": "Sevilla",
        "soria": "Soria",
        "tarragona": "Tarragona",
        "teruel": "Teruel",
        "toledo": "Toledo",
        "valencia": "Valencia",
        "valladolid": "Valladolid",
        "zamora": "Zamora",
        "zaragoza": "Zaragoza",
    }
}
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
# Province (or district) display name -> postcode prefix, keyed by lowercase
# country code. Several provinces appear under more than one spelling
# (e.g. "Gerona"/"Girona", "Lérida"/"Lleida"), all mapping to the same prefix.
# "es" prefixes are two digits; "pt" prefixes are a single digit, so several
# "pt" names share the same prefix.
PROVINCE_TO_POSTCODE = {
    "es": {
        "A Coruña": "15",
        "Álava": "01",
        "Araba": "01",
        "Alacant": "03",
        "Alicante": "03",
        "Albacete": "02",
        "Almería": "04",
        "Asturias": "33",
        "Ávila": "05",
        "Badajoz": "06",
        "Baleares": "07",
        "Barcelona": "08",
        "Bizkaia": "48",
        "Burgos": "09",
        "Cáceres": "10",
        "Cádiz": "11",
        "Cantabria": "39",
        "Castelló": "12",
        "Castellón": "12",
        "Ceuta": "51",
        "Ciudad Real": "13",
        "Córdoba": "14",
        "Cuenca": "16",
        "Gerona": "17",
        "Gipuzkoa": "20",
        "Girona": "17",
        "Granada": "18",
        "Guadalajara": "19",
        "Guipúzcoa": "20",
        "Huelva": "21",
        "Huesca": "22",
        "Illes Balears": "07",
        "Jaén": "23",
        "La Coruña": "15",
        "La Rioja": "26",
        "Las Palmas": "35",
        "León": "24",
        "Lérida": "25",
        "Lleida": "25",
        "Lugo": "27",
        "Madrid": "28",
        "Málaga": "29",
        "Melilla": "52",
        "Murcia": "30",
        "Navarra": "31",
        "Orense": "32",
        "Ourense": "32",
        "Palencia": "34",
        "Pontevedra": "36",
        "Salamanca": "37",
        "Santa Cruz de Tenerife": "38",
        "Segovia": "40",
        "Sevilla": "41",
        "Soria": "42",
        "Tarragona": "43",
        "Teruel": "44",
        "Toledo": "45",
        "València": "46",
        "Valencia": "46",
        "Valladolid": "47",
        "Vizcaya": "48",
        "Zamora": "49",
        "Zaragoza": "50",
    },
    "pt": {
        "Aveiro": "3",
        "Beja": "7",
        "Braga": "4",
        "Bragança": "5",
        "Castelo Branco": "6",
        "Coimbra": "3",
        "Évora": "7",
        "Faro": "8",
        "Guarda": "6",
        "Leiria": "2",
        "Lisboa": "1",
        "Portalegre": "7",
        "Porto": "4",
        "Santarém": "2",
        "Setúbal": "2",
        "Viana do Castelo": "4",
        "Vila Real": "5",
        "Viseu": "3",
        "Açores": "9",
        "Madeira": "9",
    },
}
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
# Two-digit postcode prefix -> region name, keyed by lowercase country code.
# NOTE(review): some "es" values use the long official form ("Principado de
# Asturias", "Islas Baleares", "Región de Murcia", "Comunidad Foral de
# Navarra") while STATES stores the short form — confirm callers normalize.
# NOTE(review): the "pt" values look like district names rather than states,
# and not every two-digit prefix is listed — confirm against callers.
POSTCODE_TO_STATES = {
    "es": {
        # Andalucía
        "04": "Andalucía",
        "11": "Andalucía",
        "14": "Andalucía",
        "18": "Andalucía",
        "21": "Andalucía",
        "23": "Andalucía",
        "29": "Andalucía",
        "41": "Andalucía",
        # Aragón
        "22": "Aragón",
        "44": "Aragón",
        "50": "Aragón",
        # Asturias
        "33": "Principado de Asturias",
        # Baleares
        "07": "Islas Baleares",
        # Canarias
        "35": "Canarias",
        "38": "Canarias",
        # Cantabria
        "39": "Cantabria",
        # Castilla y León
        "05": "Castilla y León",
        "09": "Castilla y León",
        "24": "Castilla y León",
        "34": "Castilla y León",
        "37": "Castilla y León",
        "40": "Castilla y León",
        "42": "Castilla y León",
        "47": "Castilla y León",
        "49": "Castilla y León",
        # Castilla-La Mancha
        "02": "Castilla-La Mancha",
        "13": "Castilla-La Mancha",
        "16": "Castilla-La Mancha",
        "19": "Castilla-La Mancha",
        "45": "Castilla-La Mancha",
        # Cataluña
        "08": "Cataluña",
        "17": "Cataluña",
        "25": "Cataluña",
        "43": "Cataluña",
        # Comunidad Valenciana
        "03": "Comunidad Valenciana",
        "12": "Comunidad Valenciana",
        "46": "Comunidad Valenciana",
        # Extremadura
        "06": "Extremadura",
        "10": "Extremadura",
        # Galicia
        "15": "Galicia",
        "27": "Galicia",
        "32": "Galicia",
        "36": "Galicia",
        # Madrid
        "28": "Comunidad de Madrid",
        # Murcia
        "30": "Región de Murcia",
        # Navarra
        "31": "Comunidad Foral de Navarra",
        # País Vasco
        "01": "País Vasco",
        "20": "País Vasco",
        "48": "País Vasco",
        # La Rioja
        "26": "La Rioja",
        # Autonomous cities
        "51": "Ceuta",
        "52": "Melilla",
    },
    "pt": {  # --- NORTH ---
        "40": "Porto",
        "41": "Porto",
        "42": "Porto",
        "43": "Porto",
        "44": "Porto",
        "45": "Aveiro",  # Municipalities in northern Aveiro, on the border with Porto.
        "47": "Braga",
        "48": "Braga",  # Guimarães.
        "49": "Viana do Castelo",
        "50": "Vila Real",
        "51": "Vila Real",
        "52": "Vila Real",
        "53": "Vila Real / Bragança",  # Border area.
        "54": "Bragança",
        # --- CENTRE ---
        "60": "Castelo Branco",
        "61": "Castelo Branco",
        "62": "Castelo Branco",
        "63": "Guarda",
        "30": "Coimbra",
        "31": "Coimbra",
        "32": "Coimbra",
        "33": "Coimbra",
        "34": "Viseu",
        "35": "Viseu",
        "37": "Aveiro",
        "38": "Aveiro",
        "24": "Leiria",
        # --- LISBON METROPOLITAN AREA and surroundings ---
        "10": "Lisboa",
        "11": "Lisboa",
        "12": "Lisboa",
        "13": "Lisboa",
        "14": "Lisboa",
        "15": "Lisboa",
        "16": "Lisboa",
        "17": "Lisboa",
        "18": "Lisboa",
        "19": "Lisboa",
        "20": "Santarém",
        "21": "Santarém",
        "22": "Santarém",
        "23": "Santarém",  # Tomar and Torres Novas.
        "25": "Lisboa",  # Municipalities such as Torres Vedras, Mafra, Alenquer.
        "26": "Lisboa",  # Municipalities such as Loures, Amadora, Odivelas.
        "27": "Lisboa",  # Municipalities such as Sintra, Cascais, Oeiras.
        "28": "Setúbal",
        "29": "Setúbal",
        # --- ALENTEJO ---
        "70": "Évora",
        "71": "Évora",
        "72": "Évora",
        "73": "Portalegre",
        "74": "Portalegre",
        "75": "Setúbal",  # Alentejo coast (Sines, Grândola), administratively part of Setúbal.
        "76": "Beja",
        "77": "Beja",
        "78": "Beja",
        "79": "Beja",
        # --- ALGARVE ---
        "80": "Faro",
        "81": "Faro",
        "82": "Faro",
        "83": "Faro",
        "84": "Faro",
        "85": "Faro",
        "86": "Faro",
        "87": "Faro",
        "88": "Faro",
        "89": "Faro",
        # --- AUTONOMOUS REGIONS ---
        "90": "Madeira",
        "91": "Madeira",
        "92": "Madeira",
        "93": "Madeira",
        "95": "Açores",  # São Miguel island (Ponta Delgada).
        "96": "Açores",  # São Miguel island (Ribeira Grande) and Santa Maria.
        "97": "Açores",  # Terceira island (Angra do Heroísmo).
        "98": "Açores",  # São Jorge, Graciosa, Faial and Pico islands.
        "99": "Açores",  # Flores and Corvo islands.
    },
}
|
|
395
|
+
|
|
396
|
+
# Per-country lookup lists derived once at import time from PROVINCE_TO_POSTCODE.
# "choices" and "keys" are index-aligned: choices[i] is the ASCII-folded,
# lowercased form of keys[i].
_NORMALIZED_PROVINCE_CACHE = {}
for country, provinces in PROVINCE_TO_POSTCODE.items():
    # Province names exactly as written in the table (e.g., "A Coruña", "Álava")
    original_keys = list(provinces)

    # Comparison-friendly versions of the same names (e.g., "a coruna", "alava")
    normalized_choices = [unidecode(name).lower() for name in original_keys]

    _NORMALIZED_PROVINCE_CACHE[country] = dict(
        choices=normalized_choices,  # The list for rapidfuzz to search in
        keys=original_keys,  # The list to find the name by index
    )
|
|
408
|
+
|
|
409
|
+
# Source: https://github.com/ariankoochak/regex-patterns-of-all-countries
# Pre-compiled regular expressions per lowercase country code:
#   zip_validate_pattern   - anchored (^...$); the whole string must be a postcode
#   zip_search_pattern     - word-boundary delimited; finds a postcode inside text
#   phone_validate_pattern - anchored; optional country prefix followed by the number
COUNTRY_PARSING_RULES = {
    "es": {
        "zip_validate_pattern": re.compile(r"^\d{5}$"),
        "zip_search_pattern": re.compile(r"\b\d{5}\b"),
        # Optional "34"/"+34" prefix, then 9 digits starting with 6 or 7.
        "phone_validate_pattern": re.compile(r"^(\+?34)?[67]\d{8}$"),
    },
    "pt": {
        # Either NNNN-NNN (hyphen or space optional) or the bare 4-digit form.
        "zip_validate_pattern": re.compile(r"^\d{4}[- ]{0,1}\d{3}$|^\d{4}$"),
        "zip_search_pattern": re.compile(r"\b\d{4}[- ]?\d{3}\b|\b\d{4}\b"),
        # Optional "351"/"+351" prefix, then 9 digits starting with 91, 92, 93 or 96.
        "phone_validate_pattern": re.compile(r"^(\+?351)?9[1236]\d{7}$"),
    },
}
|
|
422
|
+
|
|
423
|
+
# Name -> Madrid district replacements that apply unconditionally.
# NOTE(review): presumably the key is the district value as received from the
# geocoder and the value is the corrected district — confirm against callers.
MADRID_DISTRICT_DIRECT_PATCH = {
    # Direct corrections
    "Aravaca": "Moncloa-Aravaca",
    "Puerta de Hierro": "Fuencarral-El Pardo",
    "Palacio": "Centro",
    "Argüelles": "Moncloa-Aravaca",
    "Barrio de La Estación": "Latina",
    "Casa de Campo": "Moncloa-Aravaca",
    "Universidad": "Centro",
    "Valdezarza": "Moncloa-Aravaca",
    "Cortes": "Centro",
    "Barrio de la Latina": "Centro",
    "Ciudad Universitaria": "Moncloa-Aravaca",
    "Embajadores": "Centro",
    "Justicia": "Centro",
    "Sol": "Centro",
    "Barrio de los Austrias": "Centro",
}
|
|
441
|
+
|
|
442
|
+
# Madrid district replacements keyed by a 2-tuple of names.
# NOTE(review): the tuple is presumably (district, quarter) as received, and the
# value the corrected district — confirm against callers.
MADRID_DISTRICT_QUARTER_PATCH = {
    # Rules that depend on the quarter
    ("Centro", "Atocha"): "Arganzuela",
    ("Centro", "Gaztambide"): "Chamberí",
    ("Centro", "Imperial"): "Arganzuela",
    ("Centro", "Palos de Moguer"): "Arganzuela",
    ("Arganzuela", "Embajadores"): "Centro",
    ("Salamanca", "La Elipa"): "Ciudad Lineal",
    ("Salamanca", "Ventas"): "Ciudad Lineal",
    ("Tetuán", "La Paz"): "Fuencarral-El Pardo",
    ("Tetuán", "San Cristóbal"): "Villaverde",
    ("Tetuán", "Colonia de San Cristóbal"): "Villaverde",
    ("Tetuán", "Valdezarza"): "Moncloa-Aravaca",
    ("Chamberí", "Ciudad Universitaria"): "Moncloa-Aravaca",
    ("Chamberí", "Justicia"): "Centro",
    ("Chamberí", "Universidad"): "Centro",
    ("Fuencarral-El Pardo", "Castilla"): "Chamartín",
    ("Fuencarral-El Pardo", "Valdeacederas"): "Tetuán",
    ("Fuencarral-El Pardo", "Valdezarza"): "Moncloa-Aravaca",
    ("Moncloa-Aravaca", "Bellas Vistas"): "Tetuán",
    ("Moncloa-Aravaca", "Berruguete"): "Tetuán",
    ("Moncloa-Aravaca", "Campamento"): "Latina",
    ("Moncloa-Aravaca", "Gaztambide"): "Chamberí",
    ("Moncloa-Aravaca", "Lucero"): "Latina",
    ("Moncloa-Aravaca", "Valdeacederas"): "Tetuán",
    ("Moncloa-Aravaca", "Vallehermoso"): "Chamberí",
    ("Latina", "Casa de Campo"): "Moncloa-Aravaca",
    ("Villaverde", "San Fermín"): "Usera",
    ("San Blas - Canillejas", "Concepción"): "Ciudad Lineal",
    ("San Blas - Canillejas", "Quintana"): "Ciudad Lineal",
    ("Barajas", "Palomas"): "Hortaleza",
}
|
|
474
|
+
|
|
475
|
+
# Name -> Madrid quarter replacements that apply unconditionally.
# NOTE(review): presumably the key is the quarter value as received and the
# value is the corrected quarter name — confirm against callers.
MADRID_QUARTER_DIRECT_PATCH = {
    "Barrio de la Latina": "Palacio",
    "Barrio de las Letras": "Cortes",
    "Barrio de los Austrias": "Palacio",
    "Colonia de San Cristóbal": "San Cristóbal",
    "Encinar de los Reyes": "Valdefuentes",
    "La Elipa": "Ventas",
    "Las Cárcavas - San Antonio": "Valdefuentes",
    "Lavapiés": "Embajadores",
    "Montecarmelo": "El Goloso",
    "Puerta de Hierro": "Ciudad Universitaria",
    "Villaverde Alto, Casco Histórico de Villaverde": "San Andrés",
    "Villaverde Bajo": "Los Rosales",
    "Virgen del Cortijo": "Valdefuentes",
    "Las Acacias": "Acacias",
}
|
|
491
|
+
|
|
492
|
+
# Cutoff score for rapidfuzz in the name standardization function.
# NOTE(review): presumably matches scoring below this value are rejected —
# confirm against the standardization function, which is not in this module.
STANDARD_THRESHOLD = 40
|
datamarket/utils/__init__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
from .main import *
|
|
1
|
+
from .main import * # noqa: F403
|
datamarket/utils/airflow.py
CHANGED
|
@@ -3,20 +3,23 @@
|
|
|
3
3
|
|
|
4
4
|
import re
|
|
5
5
|
import unicodedata
|
|
6
|
+
|
|
6
7
|
import inflection
|
|
7
8
|
|
|
8
9
|
########################################################################################################################
|
|
9
10
|
# FUNCTIONS
|
|
10
11
|
|
|
12
|
+
|
|
11
13
|
def process_task_name(task_id):
    """Turn an arbitrary string into a lowercase, underscore-separated task id.

    Alphanumeric characters are kept. Any other character is replaced by its
    Unicode name wrapped in underscores (for example "&" becomes
    "_AMPERSAND_" before normalization), except characters in the Unicode
    categories Cc, Cf, Cs, Co and Cn, which are dropped. The result is then
    passed through ``inflection.parameterize`` with "_" as separator,
    lowercased, stripped of leading/trailing underscores and has runs of
    underscores collapsed. Ids starting with a digit get a "task_" prefix.

    Args:
        task_id: The raw name to sanitize.

    Returns:
        The sanitized id. An empty string is returned when nothing survives
        sanitization (empty input, or input made only of dropped characters);
        the caller decides how to treat that case.
    """
    task_id = "".join(
        f"_{unicodedata.name(c)}_" if not c.isalnum() else c
        for c in task_id
        # Control, format, surrogate, private-use and unassigned characters are
        # dropped here so that unicodedata.name() is never asked to name them.
        if c.isalnum() or (unicodedata.category(c) not in ("Cc", "Cf", "Cs", "Co", "Cn"))
    )
    task_id = inflection.parameterize(task_id, separator="_")
    task_id = task_id.lower()
    task_id = task_id.strip("_")
    task_id = re.sub(r"_+", "_", task_id)
    # Slice instead of indexing: task_id may be empty at this point, and
    # task_id[0] would raise IndexError.
    if task_id[:1].isdigit():
        task_id = "task_" + task_id
    return task_id
|
datamarket/utils/alchemy.py
CHANGED
|
@@ -8,6 +8,7 @@ from sqlalchemy.ext.declarative import declarative_base
|
|
|
8
8
|
|
|
9
9
|
Base = declarative_base()
|
|
10
10
|
|
|
11
|
+
|
|
11
12
|
class View(Base):
|
|
12
13
|
__abstract__ = True
|
|
13
14
|
is_view = True
|
|
@@ -19,4 +20,4 @@ class View(Base):
|
|
|
19
20
|
"""
|
|
20
21
|
conn.execute(f"""
|
|
21
22
|
CREATE OR REPLACE VIEW {cls.__tablename__} AS {query}
|
|
22
|
-
""")
|
|
23
|
+
""")
|
datamarket/utils/logs.py
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
from typing import Any, TypeAlias
|
|
2
|
+
|
|
3
|
+
# --- Type Definitions ---
# All four names alias plain `str`; they differ only in the role they document.
AnsiCode: TypeAlias = str
StyleCode: TypeAlias = AnsiCode  # e.g. BOLD, UNDERLINE
ShadeCode: TypeAlias = AnsiCode  # e.g. GREEN, BLUE
ColorCode: TypeAlias = AnsiCode  # Final combined result
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class Color:
    """Raw ANSI escape codes. Internal building blocks for the library.

    Every attribute is a plain string holding one escape sequence. The other
    classes in this module are built from these values.
    """

    # Appended by colorize() after the text to end the styling.
    RESET: AnsiCode = "\033[0m"

    # Styles (Combinable)
    BOLD: StyleCode = "\033[1m"
    UNDERLINE: StyleCode = "\033[4m"

    # --- THE 14 PROTECTED SHADES ---
    # Standard (Deeper tones): codes 32-37
    S_GREEN: ShadeCode = "\033[32m"
    S_BLUE: ShadeCode = "\033[34m"
    S_PURPLE: ShadeCode = "\033[35m"
    S_CYAN: ShadeCode = "\033[36m"
    S_WHITE: ShadeCode = "\033[37m"

    # High-Intensity (Vibrant tones): codes 90-97
    H_GREY: ShadeCode = "\033[90m"
    H_GREEN: ShadeCode = "\033[92m"
    H_BLUE: ShadeCode = "\033[94m"
    H_PURPLE: ShadeCode = "\033[95m"
    H_CYAN: ShadeCode = "\033[96m"
    H_WHITE: ShadeCode = "\033[97m"

    # Extended Palette: "38;5;N" sequences
    TEAL: ShadeCode = "\033[38;5;30m"
    LAVENDER: ShadeCode = "\033[38;5;147m"
    OLIVE: ShadeCode = "\033[38;5;64m"
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def combine(*codes: AnsiCode) -> ColorCode:
    """Concatenate ANSI codes, in the order given, into a single code.

    Args:
        *codes: Escape sequences to merge (e.g., BOLD followed by GREEN).

    Returns:
        One string containing every given code back-to-back; an empty string
        when no codes are passed.
    """
    glue = ""  # escape sequences are emitted with nothing between them
    return glue.join(codes)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class SystemColor:
    """
    RESERVED: For core library internals.
    Strictly BOLD or UNDERLINED to distinguish from scraper data.

    Each attribute is a style code (bold or underline) combined with one shade
    from Color.
    """

    # Verbs / Actions (BOLD)
    BATCH_PIPELINE_STATS: ColorCode = combine(Color.BOLD, Color.H_PURPLE)

    # State / Nouns (UNDERLINED)
    PROCESS_BATCH_PROGRESS: ColorCode = combine(Color.UNDERLINE, Color.S_CYAN)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class ScraperColor:
    """
    USER-FACING: Standard colors for everyday scraper logic.
    Raw shades only. It is up to the user how to apply these.

    Each attribute re-exports one shade from Color under a friendlier name;
    where a colour exists in both intensities, the brighter name maps to the
    high-intensity code and the darker name to the standard code.
    """

    GREY: ShadeCode = Color.H_GREY
    EMERALD: ShadeCode = Color.H_GREEN
    FOREST: ShadeCode = Color.S_GREEN
    SKY: ShadeCode = Color.H_BLUE
    NAVY: ShadeCode = Color.S_BLUE
    VIOLET: ShadeCode = Color.H_PURPLE
    PLUM: ShadeCode = Color.S_PURPLE
    CYAN: ShadeCode = Color.H_CYAN
    # NOTE(review): TEAL is the standard cyan code, not the extended-palette
    # Color.TEAL, which is not exposed here — confirm this is intended.
    TEAL: ShadeCode = Color.S_CYAN
    WHITE: ShadeCode = Color.H_WHITE
    SILVER: ShadeCode = Color.S_WHITE
    LAVENDER: ShadeCode = Color.LAVENDER
    OLIVE: ShadeCode = Color.OLIVE
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def colorize(text: Any, color_code: ColorCode) -> str:
    """
    Surround a value with an ANSI code and the reset sequence.

    Args:
        text: The value to colorize; any type is accepted and is rendered
            with default string formatting.
        color_code: A ShadeCode, StyleCode, or combined ColorCode placed
            before the text.

    Returns:
        The color code, the formatted text and Color.RESET, in that order.
    """
    template = "{}{}{}"
    return template.format(color_code, text, Color.RESET)
|