datamarket 0.7.41__py3-none-any.whl → 0.7.125__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamarket/__init__.py +0 -1
- datamarket/exceptions/__init__.py +1 -0
- datamarket/exceptions/main.py +118 -0
- datamarket/interfaces/alchemy.py +206 -64
- datamarket/interfaces/aws.py +81 -14
- datamarket/interfaces/azure.py +127 -0
- datamarket/interfaces/drive.py +60 -10
- datamarket/interfaces/ftp.py +23 -16
- datamarket/interfaces/nominatim.py +312 -39
- datamarket/interfaces/peerdb.py +40 -5
- datamarket/interfaces/proxy.py +354 -50
- datamarket/interfaces/tinybird.py +4 -12
- datamarket/params/nominatim.py +439 -0
- datamarket/utils/__init__.py +1 -1
- datamarket/utils/airflow.py +10 -7
- datamarket/utils/alchemy.py +2 -1
- datamarket/utils/main.py +127 -6
- datamarket/utils/nominatim.py +201 -0
- datamarket/utils/playwright/__init__.py +0 -0
- datamarket/utils/playwright/async_api.py +274 -0
- datamarket/utils/playwright/sync_api.py +281 -0
- datamarket/utils/requests.py +653 -0
- datamarket/utils/selenium.py +6 -12
- datamarket/utils/strings/__init__.py +1 -0
- datamarket/utils/strings/normalization.py +217 -0
- datamarket/utils/strings/obfuscation.py +153 -0
- datamarket/utils/strings/standardization.py +40 -0
- datamarket/utils/typer.py +2 -1
- datamarket/utils/types.py +1 -0
- {datamarket-0.7.41.dist-info → datamarket-0.7.125.dist-info}/METADATA +38 -18
- datamarket-0.7.125.dist-info/RECORD +36 -0
- {datamarket-0.7.41.dist-info → datamarket-0.7.125.dist-info}/WHEEL +1 -1
- datamarket-0.7.41.dist-info/RECORD +0 -23
- {datamarket-0.7.41.dist-info → datamarket-0.7.125.dist-info}/LICENSE +0 -0
datamarket/params/nominatim.py
CHANGED
|
@@ -1,3 +1,9 @@
|
|
|
1
|
+
import re
|
|
2
|
+
|
|
3
|
+
from unidecode import unidecode
|
|
4
|
+
|
|
5
|
+
# Maps a city name to its province name; only Madrid is listed here.
CITY_TO_PROVINCE = {"Madrid": "Madrid"}
|
|
6
|
+
|
|
1
7
|
POSTCODES = {
|
|
2
8
|
"01": "Álava",
|
|
3
9
|
"02": "Albacete",
|
|
@@ -52,3 +58,436 @@ POSTCODES = {
|
|
|
52
58
|
"51": "Ceuta",
|
|
53
59
|
"52": "Melilla",
|
|
54
60
|
}
|
|
61
|
+
|
|
62
|
+
# Mapping of normalized names (for comparison) to standardized names (for storing)
# for each corresponding country code.
# Outer keys are lowercase country codes; inner keys are lowercase, accent-free
# names and inner values are the display names.
# NOTE(review): POSTCODE_TO_STATES in this module spells some of the same regions
# differently ("Principado de Asturias", "Islas Baleares", "Región de Murcia",
# "Comunidad Foral de Navarra") - confirm which spelling is meant to be canonical.
STATES = {
    "es": {
        "andalucia": "Andalucía",
        "aragon": "Aragón",
        "asturias": "Asturias",
        "baleares": "Baleares",
        "canarias": "Canarias",
        "cantabria": "Cantabria",
        "castilla la mancha": "Castilla-La Mancha",
        "castilla y leon": "Castilla y León",
        "cataluna": "Cataluña",
        "ceuta": "Ceuta",
        "comunidad valenciana": "Comunidad Valenciana",
        "extremadura": "Extremadura",
        "galicia": "Galicia",
        "la rioja": "La Rioja",
        "madrid": "Comunidad de Madrid",
        "melilla": "Melilla",
        "murcia": "Murcia",
        "navarra": "Navarra",
        "pais vasco": "País Vasco",
        "euskadi": "País Vasco",  # Alias not caught by rapidfuzz
    }
}
|
|
88
|
+
|
|
89
|
+
# Province counterpart of a normalized-name -> display-name table, keyed first by
# lowercase country code and then by lowercase, accent-free province name.
# NOTE(review): the Basque provinces are not spelled consistently - "bizkaia" is
# stored as the Spanish "Vizcaya" while "gipuzkoa" is stored as the Basque
# "Gipuzkoa"; confirm this is intentional.
PROVINCES = {
    "es": {
        "alava": "Álava",
        "araba": "Álava",  # Alias not caught by rapidfuzz
        "albacete": "Albacete",
        "alicante": "Alicante",
        "almeria": "Almería",
        "asturias": "Asturias",
        "avila": "Ávila",
        "badajoz": "Badajoz",
        "barcelona": "Barcelona",
        "bizkaia": "Vizcaya",
        "burgos": "Burgos",
        "caceres": "Cáceres",
        "cadiz": "Cádiz",
        "cantabria": "Cantabria",
        "castellon": "Castellón",
        "ceuta": "Ceuta",  # Considered a province by opensm and/or geonames
        "ciudad real": "Ciudad Real",
        "cordoba": "Córdoba",
        "cuenca": "Cuenca",
        "gipuzkoa": "Gipuzkoa",
        "gerona": "Gerona",
        "granada": "Granada",
        "guadalajara": "Guadalajara",
        "huelva": "Huelva",
        "huesca": "Huesca",
        "islas baleares": "Islas Baleares",
        "jaen": "Jaén",
        "la coruna": "La Coruña",
        "la rioja": "La Rioja",
        "las palmas": "Las Palmas",
        "leon": "León",
        "lerida": "Lérida",
        "lugo": "Lugo",
        "madrid": "Madrid",
        "malaga": "Málaga",
        "melilla": "Melilla",  # Considered a province by opensm and/or geonames
        "murcia": "Murcia",
        "navarra": "Navarra",
        "orense": "Orense",
        "palencia": "Palencia",
        "pontevedra": "Pontevedra",
        "salamanca": "Salamanca",
        "santa cruz de tenerife": "Santa Cruz de Tenerife",
        "segovia": "Segovia",
        "sevilla": "Sevilla",
        "soria": "Soria",
        "tarragona": "Tarragona",
        "teruel": "Teruel",
        "toledo": "Toledo",
        "valencia": "Valencia",
        "valladolid": "Valladolid",
        "zamora": "Zamora",
        "zaragoza": "Zaragoza",
    }
}
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
# Province/district display name -> postcode prefix, per lowercase country code.
# Spanish entries use a two-digit string prefix and list several spellings of the
# same province (e.g. "Gerona"/"Girona", "Lérida"/"Lleida") under the same prefix.
# Portuguese entries use a single-digit string, so several districts share a value.
PROVINCE_TO_POSTCODE = {
    "es": {
        "A Coruña": "15",
        "Álava": "01",
        "Araba": "01",
        "Alacant": "03",
        "Alicante": "03",
        "Albacete": "02",
        "Almería": "04",
        "Asturias": "33",
        "Ávila": "05",
        "Badajoz": "06",
        "Baleares": "07",
        "Barcelona": "08",
        "Bizkaia": "48",
        "Burgos": "09",
        "Cáceres": "10",
        "Cádiz": "11",
        "Cantabria": "39",
        "Castelló": "12",
        "Castellón": "12",
        "Ceuta": "51",
        "Ciudad Real": "13",
        "Córdoba": "14",
        "Cuenca": "16",
        "Gerona": "17",
        "Gipuzkoa": "20",
        "Girona": "17",
        "Granada": "18",
        "Guadalajara": "19",
        "Guipúzcoa": "20",
        "Huelva": "21",
        "Huesca": "22",
        "Illes Balears": "07",
        "Jaén": "23",
        "La Coruña": "15",
        "La Rioja": "26",
        "Las Palmas": "35",
        "León": "24",
        "Lérida": "25",
        "Lleida": "25",
        "Lugo": "27",
        "Madrid": "28",
        "Málaga": "29",
        "Melilla": "52",
        "Murcia": "30",
        "Navarra": "31",
        "Orense": "32",
        "Ourense": "32",
        "Palencia": "34",
        "Pontevedra": "36",
        "Salamanca": "37",
        "Santa Cruz de Tenerife": "38",
        "Segovia": "40",
        "Sevilla": "41",
        "Soria": "42",
        "Tarragona": "43",
        "Teruel": "44",
        "Toledo": "45",
        "València": "46",
        "Valencia": "46",
        "Valladolid": "47",
        "Vizcaya": "48",
        "Zamora": "49",
        "Zaragoza": "50",
    },
    "pt": {
        "Aveiro": "3",
        "Beja": "7",
        "Braga": "4",
        "Bragança": "5",
        "Castelo Branco": "6",
        "Coimbra": "3",
        "Évora": "7",
        "Faro": "8",
        "Guarda": "6",
        "Leiria": "2",
        "Lisboa": "1",
        "Portalegre": "7",
        "Porto": "4",
        "Santarém": "2",
        "Setúbal": "2",
        "Viana do Castelo": "4",
        "Vila Real": "5",
        "Viseu": "3",
        "Açores": "9",
        "Madeira": "9",
    },
}
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
# Two-digit postcode prefix -> region name, per lowercase country code.
# For "es" the values are autonomous communities (plus Ceuta and Melilla); for
# "pt" the values are the district / autonomous-region names used as keys in
# PROVINCE_TO_POSTCODE["pt"], with one composite exception noted below.
POSTCODE_TO_STATES = {
    "es": {
        # Andalucía
        "04": "Andalucía",
        "11": "Andalucía",
        "14": "Andalucía",
        "18": "Andalucía",
        "21": "Andalucía",
        "23": "Andalucía",
        "29": "Andalucía",
        "41": "Andalucía",
        # Aragón
        "22": "Aragón",
        "44": "Aragón",
        "50": "Aragón",
        # Asturias
        "33": "Principado de Asturias",
        # Baleares
        "07": "Islas Baleares",
        # Canarias
        "35": "Canarias",
        "38": "Canarias",
        # Cantabria
        "39": "Cantabria",
        # Castilla y León
        "05": "Castilla y León",
        "09": "Castilla y León",
        "24": "Castilla y León",
        "34": "Castilla y León",
        "37": "Castilla y León",
        "40": "Castilla y León",
        "42": "Castilla y León",
        "47": "Castilla y León",
        "49": "Castilla y León",
        # Castilla-La Mancha
        "02": "Castilla-La Mancha",
        "13": "Castilla-La Mancha",
        "16": "Castilla-La Mancha",
        "19": "Castilla-La Mancha",
        "45": "Castilla-La Mancha",
        # Cataluña
        "08": "Cataluña",
        "17": "Cataluña",
        "25": "Cataluña",
        "43": "Cataluña",
        # Comunidad Valenciana
        "03": "Comunidad Valenciana",
        "12": "Comunidad Valenciana",
        "46": "Comunidad Valenciana",
        # Extremadura
        "06": "Extremadura",
        "10": "Extremadura",
        # Galicia
        "15": "Galicia",
        "27": "Galicia",
        "32": "Galicia",
        "36": "Galicia",
        # Madrid
        "28": "Comunidad de Madrid",
        # Murcia
        "30": "Región de Murcia",
        # Navarra
        "31": "Comunidad Foral de Navarra",
        # País Vasco
        "01": "País Vasco",
        "20": "País Vasco",
        "48": "País Vasco",
        # La Rioja
        "26": "La Rioja",
        # Autonomous cities
        "51": "Ceuta",
        "52": "Melilla",
    },
    "pt": {  # --- NORTH ---
        "40": "Porto",
        "41": "Porto",
        "42": "Porto",
        "43": "Porto",
        "44": "Porto",
        "45": "Aveiro",  # Municipalities in northern Aveiro, on the border with Porto.
        "47": "Braga",
        "48": "Braga",  # Guimarães.
        "49": "Viana do Castelo",
        "50": "Vila Real",
        "51": "Vila Real",
        "52": "Vila Real",
        # NOTE(review): the only composite value in this table; it is not a key of
        # PROVINCE_TO_POSTCODE["pt"] - confirm consumers can handle it.
        "53": "Vila Real / Bragança",  # Border area.
        "54": "Bragança",
        # --- CENTER ---
        "60": "Castelo Branco",
        "61": "Castelo Branco",
        "62": "Castelo Branco",
        "63": "Guarda",
        "30": "Coimbra",
        "31": "Coimbra",
        "32": "Coimbra",
        "33": "Coimbra",
        "34": "Viseu",
        "35": "Viseu",
        "37": "Aveiro",
        "38": "Aveiro",
        "24": "Leiria",
        # --- LISBON METROPOLITAN AREA and surroundings ---
        "10": "Lisboa",
        "11": "Lisboa",
        "12": "Lisboa",
        "13": "Lisboa",
        "14": "Lisboa",
        "15": "Lisboa",
        "16": "Lisboa",
        "17": "Lisboa",
        "18": "Lisboa",
        "19": "Lisboa",
        "20": "Santarém",
        "21": "Santarém",
        "22": "Santarém",
        "23": "Santarém",  # Tomar and Torres Novas.
        "25": "Lisboa",  # Municipalities such as Torres Vedras, Mafra, Alenquer.
        "26": "Lisboa",  # Municipalities such as Loures, Amadora, Odivelas.
        "27": "Lisboa",  # Municipalities such as Sintra, Cascais, Oeiras.
        "28": "Setúbal",
        "29": "Setúbal",
        # --- ALENTEJO ---
        "70": "Évora",
        "71": "Évora",
        "72": "Évora",
        "73": "Portalegre",
        "74": "Portalegre",
        "75": "Setúbal",  # Alentejo coast (Sines, Grândola), administratively part of Setúbal.
        "76": "Beja",
        "77": "Beja",
        "78": "Beja",
        "79": "Beja",
        # --- ALGARVE ---
        "80": "Faro",
        "81": "Faro",
        "82": "Faro",
        "83": "Faro",
        "84": "Faro",
        "85": "Faro",
        "86": "Faro",
        "87": "Faro",
        "88": "Faro",
        "89": "Faro",
        # --- AUTONOMOUS REGIONS ---
        "90": "Madeira",
        "91": "Madeira",
        "92": "Madeira",
        "93": "Madeira",
        "95": "Açores",  # São Miguel island (Ponta Delgada).
        "96": "Açores",  # São Miguel island (Ribeira Grande) and Santa Maria.
        "97": "Açores",  # Terceira island (Angra do Heroísmo).
        "98": "Açores",  # Islands of São Jorge, Graciosa, Faial, Pico.
        "99": "Açores",  # Islands of Flores and Corvo.
    },
}
|
|
395
|
+
|
|
396
|
+
# Per-country lookup tables derived from PROVINCE_TO_POSTCODE, built once at import time.
_NORMALIZED_PROVINCE_CACHE = {}
for country, provinces in PROVINCE_TO_POSTCODE.items():
    # Province names exactly as spelled in the source table (e.g., "A Coruña", "Álava")
    original_keys = [*provinces]

    # ASCII-folded, lowercased twins (e.g., "a coruna", "alava"), index-aligned with original_keys
    normalized_choices = list(map(str.lower, map(unidecode, original_keys)))

    _NORMALIZED_PROVINCE_CACHE[country] = dict(
        choices=normalized_choices,  # The list for rapidfuzz to search in
        keys=original_keys,  # The list to find the name by index
    )
|
|
408
|
+
|
|
409
|
+
# Source: https://github.com/ariankoochak/regex-patterns-of-all-countries
# Pre-compiled patterns per lowercase country code:
#   zip_validate_pattern   - anchored; the whole string must be a postcode.
#   zip_search_pattern     - unanchored, word-bounded; finds a postcode inside text.
#   phone_validate_pattern - anchored; optional country prefix followed by the number.
COUNTRY_PARSING_RULES = {
    "es": {
        # Exactly five digits.
        "zip_validate_pattern": re.compile(r"^\d{5}$"),
        "zip_search_pattern": re.compile(r"\b\d{5}\b"),
        # Optional "+34"/"34", then nine digits starting with 6 or 7.
        "phone_validate_pattern": re.compile(r"^(\+?34)?[67]\d{8}$"),
    },
    "pt": {
        # Four digits, optionally followed by "-" or a space and three more digits.
        "zip_validate_pattern": re.compile(r"^\d{4}[- ]{0,1}\d{3}$|^\d{4}$"),
        "zip_search_pattern": re.compile(r"\b\d{4}[- ]?\d{3}\b|\b\d{4}\b"),
        # Optional "+351"/"351", then nine digits starting with 91, 92, 93 or 96.
        "phone_validate_pattern": re.compile(r"^(\+?351)?9[1236]\d{7}$"),
    },
}
|
|
422
|
+
|
|
423
|
+
# Unconditional district replacements for Madrid: key -> district name to use instead.
# NOTE(review): presumably the keys are district values as received from the
# geocoder that are really neighbourhoods - confirm against the caller.
MADRID_DISTRICT_DIRECT_PATCH = {
    # Direct corrections
    "Aravaca": "Moncloa-Aravaca",
    "Puerta de Hierro": "Fuencarral-El Pardo",
    "Palacio": "Centro",
    "Argüelles": "Moncloa-Aravaca",
    "Barrio de La Estación": "Latina",
    "Casa de Campo": "Moncloa-Aravaca",
    "Universidad": "Centro",
    "Valdezarza": "Moncloa-Aravaca",
    "Cortes": "Centro",
    "Barrio de la Latina": "Centro",
    "Ciudad Universitaria": "Moncloa-Aravaca",
    "Embajadores": "Centro",
    "Justicia": "Centro",
    "Sol": "Centro",
    "Barrio de los Austrias": "Centro",
}
|
|
441
|
+
|
|
442
|
+
# Conditional district replacements for Madrid, keyed by a two-string tuple.
# NOTE(review): going by the constant's name and the comment below, the tuple is
# (district, quarter) and the value is the corrected district - confirm against the caller.
MADRID_DISTRICT_QUARTER_PATCH = {
    # Rules that depend on the quarter
    ("Centro", "Atocha"): "Arganzuela",
    ("Centro", "Gaztambide"): "Chamberí",
    ("Centro", "Imperial"): "Arganzuela",
    ("Centro", "Palos de Moguer"): "Arganzuela",
    ("Arganzuela", "Embajadores"): "Centro",
    ("Salamanca", "La Elipa"): "Ciudad Lineal",
    ("Salamanca", "Ventas"): "Ciudad Lineal",
    ("Tetuán", "La Paz"): "Fuencarral-El Pardo",
    ("Tetuán", "San Cristóbal"): "Villaverde",
    ("Tetuán", "Colonia de San Cristóbal"): "Villaverde",
    ("Tetuán", "Valdezarza"): "Moncloa-Aravaca",
    ("Chamberí", "Ciudad Universitaria"): "Moncloa-Aravaca",
    ("Chamberí", "Justicia"): "Centro",
    ("Chamberí", "Universidad"): "Centro",
    ("Fuencarral-El Pardo", "Castilla"): "Chamartín",
    ("Fuencarral-El Pardo", "Valdeacederas"): "Tetuán",
    ("Fuencarral-El Pardo", "Valdezarza"): "Moncloa-Aravaca",
    ("Moncloa-Aravaca", "Bellas Vistas"): "Tetuán",
    ("Moncloa-Aravaca", "Berruguete"): "Tetuán",
    ("Moncloa-Aravaca", "Campamento"): "Latina",
    ("Moncloa-Aravaca", "Gaztambide"): "Chamberí",
    ("Moncloa-Aravaca", "Lucero"): "Latina",
    ("Moncloa-Aravaca", "Valdeacederas"): "Tetuán",
    ("Moncloa-Aravaca", "Vallehermoso"): "Chamberí",
    ("Latina", "Casa de Campo"): "Moncloa-Aravaca",
    ("Villaverde", "San Fermín"): "Usera",
    ("San Blas - Canillejas", "Concepción"): "Ciudad Lineal",
    ("San Blas - Canillejas", "Quintana"): "Ciudad Lineal",
    ("Barajas", "Palomas"): "Hortaleza",
}
|
|
474
|
+
|
|
475
|
+
# Unconditional quarter replacements for Madrid: key -> quarter name to use instead.
# NOTE(review): presumably the keys are colloquial or composite neighbourhood names
# and the values are the official ones - confirm against the caller.
MADRID_QUARTER_DIRECT_PATCH = {
    "Barrio de la Latina": "Palacio",
    "Barrio de las Letras": "Cortes",
    "Barrio de los Austrias": "Palacio",
    "Colonia de San Cristóbal": "San Cristóbal",
    "Encinar de los Reyes": "Valdefuentes",
    "La Elipa": "Ventas",
    "Las Cárcavas - San Antonio": "Valdefuentes",
    "Lavapiés": "Embajadores",
    "Montecarmelo": "El Goloso",
    "Puerta de Hierro": "Ciudad Universitaria",
    "Villaverde Alto, Casco Histórico de Villaverde": "San Andrés",
    "Villaverde Bajo": "Los Rosales",
    "Virgen del Cortijo": "Valdefuentes",
    "Las Acacias": "Acacias",
}
|
|
491
|
+
|
|
492
|
+
# Cutoff score for rapidfuzz in the name standardization function.
# NOTE(review): presumably on rapidfuzz's 0-100 similarity scale - confirm against
# the scorer the standardization function actually uses.
STANDARD_THRESHOLD = 40
|
datamarket/utils/__init__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
from .main import *
|
|
1
|
+
from .main import * # noqa: F403
|
datamarket/utils/airflow.py
CHANGED
|
@@ -3,20 +3,23 @@
|
|
|
3
3
|
|
|
4
4
|
import re
|
|
5
5
|
import unicodedata
|
|
6
|
+
|
|
6
7
|
import inflection
|
|
7
8
|
|
|
8
9
|
########################################################################################################################
|
|
9
10
|
# FUNCTIONS
|
|
10
11
|
|
|
12
|
+
|
|
11
13
|
def process_task_name(task_id):
    """Turn an arbitrary string into a lowercase, underscore-separated task id.

    Steps, in order:
      1. Drop non-alphanumeric characters whose Unicode category is Cc, Cf,
         Cs, Co or Cn; replace every other non-alphanumeric character with
         its Unicode name wrapped in underscores.
      2. Run the result through ``inflection.parameterize`` with ``"_"`` as
         the separator, lowercase it, trim leading/trailing underscores and
         collapse runs of underscores.
      3. Prefix ``"task_"`` when the result starts with a digit.

    Args:
        task_id: The raw name to sanitise.

    Returns:
        The sanitised id. An empty string is returned when nothing survives
        sanitisation (e.g. empty input), instead of raising ``IndexError``.
    """
    task_id = "".join(
        f"_{unicodedata.name(c)}_" if not c.isalnum() else c
        for c in task_id
        if c.isalnum() or (unicodedata.category(c) not in ("Cc", "Cf", "Cs", "Co", "Cn"))
    )
    task_id = inflection.parameterize(task_id, separator="_")
    task_id = task_id.lower()
    task_id = task_id.strip("_")
    task_id = re.sub(r"_+", "_", task_id)
    # Slice instead of indexing so an empty result does not raise IndexError.
    if task_id[:1].isdigit():
        task_id = "task_" + task_id
    return task_id
|
datamarket/utils/alchemy.py
CHANGED
|
@@ -8,6 +8,7 @@ from sqlalchemy.ext.declarative import declarative_base
|
|
|
8
8
|
|
|
9
9
|
Base = declarative_base()
|
|
10
10
|
|
|
11
|
+
|
|
11
12
|
class View(Base):
|
|
12
13
|
__abstract__ = True
|
|
13
14
|
is_view = True
|
|
@@ -19,4 +20,4 @@ class View(Base):
|
|
|
19
20
|
"""
|
|
20
21
|
conn.execute(f"""
|
|
21
22
|
CREATE OR REPLACE VIEW {cls.__tablename__} AS {query}
|
|
22
|
-
""")
|
|
23
|
+
""")
|
datamarket/utils/main.py
CHANGED
|
@@ -9,8 +9,13 @@ import re
|
|
|
9
9
|
import shlex
|
|
10
10
|
import subprocess
|
|
11
11
|
import time
|
|
12
|
+
from datetime import timedelta
|
|
13
|
+
from typing import Sequence, overload
|
|
12
14
|
|
|
13
15
|
import pendulum
|
|
16
|
+
from babel.numbers import parse_decimal
|
|
17
|
+
|
|
18
|
+
from ..interfaces.proxy import ProxyInterface
|
|
14
19
|
|
|
15
20
|
########################################################################################################################
|
|
16
21
|
# FUNCTIONS
|
|
@@ -34,15 +39,63 @@ def set_logger(level):
|
|
|
34
39
|
log.addHandler(ch)
|
|
35
40
|
|
|
36
41
|
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
42
|
+
@overload
|
|
43
|
+
def ban_sleep(max_time: float) -> None: ...
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@overload
|
|
47
|
+
def ban_sleep(min_time: float, max_time: float) -> None: ...
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def ban_sleep(x: float, y: float | None = None) -> None:
|
|
51
|
+
"""
|
|
52
|
+
Sleep for a random number of seconds.
|
|
53
|
+
|
|
54
|
+
Usage:
|
|
55
|
+
ban_sleep(5) -> sleeps ~N(5, 2.5²) seconds, truncated to >= 0
|
|
56
|
+
ban_sleep(3, 7) -> sleeps uniformly between 3 and 7 seconds
|
|
57
|
+
ban_sleep(7, 3) -> same as above (order doesn't matter)
|
|
58
|
+
"""
|
|
59
|
+
if y is None:
|
|
60
|
+
mean = float(x)
|
|
61
|
+
std_dev = mean / 2.0
|
|
62
|
+
sleep_time = random.gauss(mean, std_dev) # noqa: S311
|
|
63
|
+
sleep_time = max(0.0, sleep_time)
|
|
64
|
+
else:
|
|
65
|
+
x, y = sorted([float(x), float(y)])
|
|
66
|
+
sleep_time = random.uniform(x, y) # noqa: S311
|
|
67
|
+
|
|
68
|
+
logger.info(f"sleeping for {sleep_time:.2f} seconds...")
|
|
40
69
|
time.sleep(sleep_time)
|
|
41
70
|
|
|
42
71
|
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
72
|
+
@overload
|
|
73
|
+
async def ban_sleep_async(seconds: float) -> None: ...
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
@overload
|
|
77
|
+
async def ban_sleep_async(min_time: float, max_time: float) -> None: ...
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
async def ban_sleep_async(min_time: float, max_time: float | None = None) -> None:
|
|
81
|
+
"""
|
|
82
|
+
Asynchronous sleep for a random number of seconds.
|
|
83
|
+
|
|
84
|
+
Usage:
|
|
85
|
+
await ban_sleep_async(5) # sleeps ~N(5, (5/2)²) seconds, truncated to >= 0
|
|
86
|
+
await ban_sleep_async(3, 7) # sleeps uniformly between 3 and 7 seconds
|
|
87
|
+
await ban_sleep_async(7, 3) # same as above (order doesn't matter)
|
|
88
|
+
"""
|
|
89
|
+
if max_time is None:
|
|
90
|
+
mean = float(min_time)
|
|
91
|
+
std_dev = mean / 2.0
|
|
92
|
+
sleep_time = random.gauss(mean, std_dev) # noqa: S311
|
|
93
|
+
sleep_time = max(0.0, sleep_time)
|
|
94
|
+
else:
|
|
95
|
+
min_time, max_time = sorted([float(min_time), float(max_time)])
|
|
96
|
+
sleep_time = random.uniform(min_time, max_time) # noqa: S311
|
|
97
|
+
|
|
98
|
+
logger.info(f"sleeping for {sleep_time:.2f} seconds...")
|
|
46
99
|
await asyncio.sleep(sleep_time)
|
|
47
100
|
|
|
48
101
|
|
|
@@ -74,6 +127,19 @@ def text_to_int(text):
|
|
|
74
127
|
return num
|
|
75
128
|
|
|
76
129
|
|
|
130
|
+
def text_to_float(text: str | None, locale: str = "es_ES") -> float | None:
    """Extract the first number found in ``text`` and parse it for ``locale``.

    The first run that starts and ends with a digit and contains only digits,
    whitespace, ``.`` and ``,`` is taken as the number. All whitespace inside
    it is removed before it is handed to ``babel.numbers.parse_decimal``.

    Args:
        text: Free text that may contain a number; ``None`` or empty yields ``None``.
        locale: Babel locale identifier that decides which of ``.`` and ``,``
            is the decimal separator. Defaults to ``"es_ES"``.

    Returns:
        The parsed value as ``float``, or ``None`` when no number is found or
        parsing fails.

    Note:
        A sign preceding the digits is not part of the match, so the result
        is never negative.
    """
    if not text:
        return None
    match = re.search(r"\d(?:[\d\s.,]*\d)?", text)
    if not match:
        return None
    # The pattern admits any whitespace (\s: tabs, non-breaking spaces, ...)
    # between digits, so strip all of it - not just the ASCII space.
    number_str = re.sub(r"\s+", "", match.group(0))
    try:
        return float(parse_decimal(number_str, locale=locale))
    except Exception:
        # Deliberate best-effort: any parsing problem means "no number".
        return None
|
|
141
|
+
|
|
142
|
+
|
|
77
143
|
def sleep_out_interval(from_h, to_h, tz="Europe/Madrid", seconds=1800):
|
|
78
144
|
while pendulum.now(tz=tz).hour >= to_h or pendulum.now(tz=tz).hour < from_h:
|
|
79
145
|
logger.warning("time to sleep and not scrape anything...")
|
|
@@ -99,3 +165,58 @@ def parse_field(dict_struct, field_path, format_method=None):
|
|
|
99
165
|
if field_value is None:
|
|
100
166
|
return None
|
|
101
167
|
return format_method(field_value) if format_method else field_value
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
def get_data(
    url: str,
    method: str = "GET",
    output: str = "json",
    sleep: tuple = (6, 3),
    proxy_interface: ProxyInterface | None = None,
    use_auth_proxies: bool = False,
    max_proxy_delay: timedelta = timedelta(minutes=10),
    ignored_status_codes: Sequence[int] = (),
    **kwargs,
):
    """
    Fetches data from a given URL using HTTP requests, with support for proxy configuration, retries, and flexible output formats.

    This is a thin convenience wrapper: it builds a ``RequestsClient`` from
    ``proxy_interface`` and forwards every other argument unchanged to
    ``RequestsClient.get_data``.

    Args:
        url (str): The target URL to fetch data from.
        method (str, optional): HTTP method to use (e.g., 'GET', 'POST'). Defaults to 'GET'.
        output (str, optional): Output format ('json', 'text', 'soup', 'response'). Defaults to 'json'.
        sleep (tuple, optional): Tuple specifying max and min sleep times (seconds) after request. Defaults to (6, 3).
        proxy_interface (ProxyInterface, optional): Passed as the only argument to ``RequestsClient``.
            Defaults to None (presumably meaning "no proxy" - confirm in RequestsClient).
        use_auth_proxies (bool, optional): Whether to use authenticated proxies. Defaults to False.
        max_proxy_delay (timedelta, optional): Maximum delay for proxy retry logic. Defaults to 10 minutes.
        ignored_status_codes (Sequence[int], optional): Status codes to ignore and return response for. Defaults to ().
        **kwargs: Additional arguments passed to the requests method (timeout defaults to 30 seconds if not specified).

    Returns:
        Depends on the 'output' argument:
            - 'json': Parsed JSON response.
            - 'text': Response text.
            - 'soup': BeautifulSoup-parsed HTML.
            - 'response': Raw requests.Response object.

    Raises:
        IgnoredHTTPError: If a response status code is in `ignored_status_codes`.
        NotFoundError: If a 404 or 410 status code is returned and not in `ignored_status_codes`.
        BadRequestError: If a 400 status code is returned and not in `ignored_status_codes`.
        EmptyResponseError: If the response has no content.
        ProxyError: On proxy-related errors.
        requests.HTTPError: For other HTTP errors if not ignored.
    """
    # NOTE(review): the Args entry for `ignored_status_codes` says the response is
    # returned, while Raises says IgnoredHTTPError is raised - confirm which one
    # RequestsClient.get_data actually does.

    # Imported inside the function rather than at module top
    # (presumably to avoid a circular import - confirm).
    from .requests import RequestsClient

    client = RequestsClient(proxy_interface)
    return client.get_data(
        url=url,
        method=method,
        output=output,
        sleep=sleep,
        use_auth_proxies=use_auth_proxies,
        max_proxy_delay=max_proxy_delay,
        ignored_status_codes=ignored_status_codes,
        **kwargs,
    )
|