datamarket 0.7.41__py3-none-any.whl → 0.7.125__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,9 @@
1
+ import re
2
+
3
+ from unidecode import unidecode
4
+
5
+ CITY_TO_PROVINCE = {"Madrid": "Madrid"}
6
+
1
7
  POSTCODES = {
2
8
  "01": "Álava",
3
9
  "02": "Albacete",
@@ -52,3 +58,436 @@ POSTCODES = {
52
58
  "51": "Ceuta",
53
59
  "52": "Melilla",
54
60
  }
61
+
62
+ # Mapping of normalized names (for comparison) to standardized names (for storing)
63
+ # for each corresponding country code
64
+ STATES = {
65
+ "es": {
66
+ "andalucia": "Andalucía",
67
+ "aragon": "Aragón",
68
+ "asturias": "Asturias",
69
+ "baleares": "Baleares",
70
+ "canarias": "Canarias",
71
+ "cantabria": "Cantabria",
72
+ "castilla la mancha": "Castilla-La Mancha",
73
+ "castilla y leon": "Castilla y León",
74
+ "cataluna": "Cataluña",
75
+ "ceuta": "Ceuta",
76
+ "comunidad valenciana": "Comunidad Valenciana",
77
+ "extremadura": "Extremadura",
78
+ "galicia": "Galicia",
79
+ "la rioja": "La Rioja",
80
+ "madrid": "Comunidad de Madrid",
81
+ "melilla": "Melilla",
82
+ "murcia": "Murcia",
83
+ "navarra": "Navarra",
84
+ "pais vasco": "País Vasco",
85
+ "euskadi": "País Vasco", # Alias not caught by rapidfuzz
86
+ }
87
+ }
88
+
89
+ PROVINCES = {
90
+ "es": {
91
+ "alava": "Álava",
92
+ "araba": "Álava", # Alias not caught by rapidfuzz
93
+ "albacete": "Albacete",
94
+ "alicante": "Alicante",
95
+ "almeria": "Almería",
96
+ "asturias": "Asturias",
97
+ "avila": "Ávila",
98
+ "badajoz": "Badajoz",
99
+ "barcelona": "Barcelona",
100
+ "bizkaia": "Vizcaya",
101
+ "burgos": "Burgos",
102
+ "caceres": "Cáceres",
103
+ "cadiz": "Cádiz",
104
+ "cantabria": "Cantabria",
105
+ "castellon": "Castellón",
106
+ "ceuta": "Ceuta", # Considered province by opensm and/or geonames
107
+ "ciudad real": "Ciudad Real",
108
+ "cordoba": "Córdoba",
109
+ "cuenca": "Cuenca",
110
+ "gipuzkoa": "Gipuzkoa",
111
+ "gerona": "Gerona",
112
+ "granada": "Granada",
113
+ "guadalajara": "Guadalajara",
114
+ "huelva": "Huelva",
115
+ "huesca": "Huesca",
116
+ "islas baleares": "Islas Baleares",
117
+ "jaen": "Jaén",
118
+ "la coruna": "La Coruña",
119
+ "la rioja": "La Rioja",
120
+ "las palmas": "Las Palmas",
121
+ "leon": "León",
122
+ "lerida": "Lérida",
123
+ "lugo": "Lugo",
124
+ "madrid": "Madrid",
125
+ "malaga": "Málaga",
126
+ "melilla": "Melilla", # Considered province by opensm and/or geonames
127
+ "murcia": "Murcia",
128
+ "navarra": "Navarra",
129
+ "orense": "Orense",
130
+ "palencia": "Palencia",
131
+ "pontevedra": "Pontevedra",
132
+ "salamanca": "Salamanca",
133
+ "santa cruz de tenerife": "Santa Cruz de Tenerife",
134
+ "segovia": "Segovia",
135
+ "sevilla": "Sevilla",
136
+ "soria": "Soria",
137
+ "tarragona": "Tarragona",
138
+ "teruel": "Teruel",
139
+ "toledo": "Toledo",
140
+ "valencia": "Valencia",
141
+ "valladolid": "Valladolid",
142
+ "zamora": "Zamora",
143
+ "zaragoza": "Zaragoza",
144
+ }
145
+ }
146
+
147
+
148
+ PROVINCE_TO_POSTCODE = {
149
+ "es": {
150
+ "A Coruña": "15",
151
+ "Álava": "01",
152
+ "Araba": "01",
153
+ "Alacant": "03",
154
+ "Alicante": "03",
155
+ "Albacete": "02",
156
+ "Almería": "04",
157
+ "Asturias": "33",
158
+ "Ávila": "05",
159
+ "Badajoz": "06",
160
+ "Baleares": "07",
161
+ "Barcelona": "08",
162
+ "Bizkaia": "48",
163
+ "Burgos": "09",
164
+ "Cáceres": "10",
165
+ "Cádiz": "11",
166
+ "Cantabria": "39",
167
+ "Castelló": "12",
168
+ "Castellón": "12",
169
+ "Ceuta": "51",
170
+ "Ciudad Real": "13",
171
+ "Córdoba": "14",
172
+ "Cuenca": "16",
173
+ "Gerona": "17",
174
+ "Gipuzkoa": "20",
175
+ "Girona": "17",
176
+ "Granada": "18",
177
+ "Guadalajara": "19",
178
+ "Guipúzcoa": "20",
179
+ "Huelva": "21",
180
+ "Huesca": "22",
181
+ "Illes Balears": "07",
182
+ "Jaén": "23",
183
+ "La Coruña": "15",
184
+ "La Rioja": "26",
185
+ "Las Palmas": "35",
186
+ "León": "24",
187
+ "Lérida": "25",
188
+ "Lleida": "25",
189
+ "Lugo": "27",
190
+ "Madrid": "28",
191
+ "Málaga": "29",
192
+ "Melilla": "52",
193
+ "Murcia": "30",
194
+ "Navarra": "31",
195
+ "Orense": "32",
196
+ "Ourense": "32",
197
+ "Palencia": "34",
198
+ "Pontevedra": "36",
199
+ "Salamanca": "37",
200
+ "Santa Cruz de Tenerife": "38",
201
+ "Segovia": "40",
202
+ "Sevilla": "41",
203
+ "Soria": "42",
204
+ "Tarragona": "43",
205
+ "Teruel": "44",
206
+ "Toledo": "45",
207
+ "València": "46",
208
+ "Valencia": "46",
209
+ "Valladolid": "47",
210
+ "Vizcaya": "48",
211
+ "Zamora": "49",
212
+ "Zaragoza": "50",
213
+ },
214
+ "pt": {
215
+ "Aveiro": "3",
216
+ "Beja": "7",
217
+ "Braga": "4",
218
+ "Bragança": "5",
219
+ "Castelo Branco": "6",
220
+ "Coimbra": "3",
221
+ "Évora": "7",
222
+ "Faro": "8",
223
+ "Guarda": "6",
224
+ "Leiria": "2",
225
+ "Lisboa": "1",
226
+ "Portalegre": "7",
227
+ "Porto": "4",
228
+ "Santarém": "2",
229
+ "Setúbal": "2",
230
+ "Viana do Castelo": "4",
231
+ "Vila Real": "5",
232
+ "Viseu": "3",
233
+ "Açores": "9",
234
+ "Madeira": "9",
235
+ },
236
+ }
237
+
238
+
239
+ POSTCODE_TO_STATES = {
240
+ "es": {
241
+ # Andalucía
242
+ "04": "Andalucía",
243
+ "11": "Andalucía",
244
+ "14": "Andalucía",
245
+ "18": "Andalucía",
246
+ "21": "Andalucía",
247
+ "23": "Andalucía",
248
+ "29": "Andalucía",
249
+ "41": "Andalucía",
250
+ # Aragón
251
+ "22": "Aragón",
252
+ "44": "Aragón",
253
+ "50": "Aragón",
254
+ # Asturias
255
+ "33": "Principado de Asturias",
256
+ # Baleares
257
+ "07": "Islas Baleares",
258
+ # Canarias
259
+ "35": "Canarias",
260
+ "38": "Canarias",
261
+ # Cantabria
262
+ "39": "Cantabria",
263
+ # Castilla y León
264
+ "05": "Castilla y León",
265
+ "09": "Castilla y León",
266
+ "24": "Castilla y León",
267
+ "34": "Castilla y León",
268
+ "37": "Castilla y León",
269
+ "40": "Castilla y León",
270
+ "42": "Castilla y León",
271
+ "47": "Castilla y León",
272
+ "49": "Castilla y León",
273
+ # Castilla-La Mancha
274
+ "02": "Castilla-La Mancha",
275
+ "13": "Castilla-La Mancha",
276
+ "16": "Castilla-La Mancha",
277
+ "19": "Castilla-La Mancha",
278
+ "45": "Castilla-La Mancha",
279
+ # Cataluña
280
+ "08": "Cataluña",
281
+ "17": "Cataluña",
282
+ "25": "Cataluña",
283
+ "43": "Cataluña",
284
+ # Comunidad Valenciana
285
+ "03": "Comunidad Valenciana",
286
+ "12": "Comunidad Valenciana",
287
+ "46": "Comunidad Valenciana",
288
+ # Extremadura
289
+ "06": "Extremadura",
290
+ "10": "Extremadura",
291
+ # Galicia
292
+ "15": "Galicia",
293
+ "27": "Galicia",
294
+ "32": "Galicia",
295
+ "36": "Galicia",
296
+ # Madrid
297
+ "28": "Comunidad de Madrid",
298
+ # Murcia
299
+ "30": "Región de Murcia",
300
+ # Navarra
301
+ "31": "Comunidad Foral de Navarra",
302
+ # País Vasco
303
+ "01": "País Vasco",
304
+ "20": "País Vasco",
305
+ "48": "País Vasco",
306
+ # La Rioja
307
+ "26": "La Rioja",
308
+ # Ciudades Autónomas
309
+ "51": "Ceuta",
310
+ "52": "Melilla",
311
+ },
312
+ "pt": { # --- NORTE ---
313
+ "40": "Porto",
314
+ "41": "Porto",
315
+ "42": "Porto",
316
+ "43": "Porto",
317
+ "44": "Porto",
318
+ "45": "Aveiro", # Concelhos do norte de Aveiro, na fronteira com Porto.
319
+ "47": "Braga",
320
+ "48": "Braga", # Guimarães.
321
+ "49": "Viana do Castelo",
322
+ "50": "Vila Real",
323
+ "51": "Vila Real",
324
+ "52": "Vila Real",
325
+ "53": "Vila Real / Bragança", # Zona fronteiriça.
326
+ "54": "Bragança",
327
+ # --- CENTRO ---
328
+ "60": "Castelo Branco",
329
+ "61": "Castelo Branco",
330
+ "62": "Castelo Branco",
331
+ "63": "Guarda",
332
+ "30": "Coimbra",
333
+ "31": "Coimbra",
334
+ "32": "Coimbra",
335
+ "33": "Coimbra",
336
+ "34": "Viseu",
337
+ "35": "Viseu",
338
+ "37": "Aveiro",
339
+ "38": "Aveiro",
340
+ "24": "Leiria",
341
+ # --- ÁREA METROPOLITANA DE LISBOA e arredores ---
342
+ "10": "Lisboa",
343
+ "11": "Lisboa",
344
+ "12": "Lisboa",
345
+ "13": "Lisboa",
346
+ "14": "Lisboa",
347
+ "15": "Lisboa",
348
+ "16": "Lisboa",
349
+ "17": "Lisboa",
350
+ "18": "Lisboa",
351
+ "19": "Lisboa",
352
+ "20": "Santarém",
353
+ "21": "Santarém",
354
+ "22": "Santarém",
355
+ "23": "Santarém", # Tomar e Torres Novas.
356
+ "25": "Lisboa", # Concelhos como Torres Vedras, Mafra, Alenquer.
357
+ "26": "Lisboa", # Concelhos como Loures, Amadora, Odivelas.
358
+ "27": "Lisboa", # Concelhos como Sintra, Cascais, Oeiras.
359
+ "28": "Setúbal",
360
+ "29": "Setúbal",
361
+ # --- ALENTEJO ---
362
+ "70": "Évora",
363
+ "71": "Évora",
364
+ "72": "Évora",
365
+ "73": "Portalegre",
366
+ "74": "Portalegre",
367
+ "75": "Setúbal", # Litoral Alentejano (Sines, Grândola), administrativamente de Setúbal.
368
+ "76": "Beja",
369
+ "77": "Beja",
370
+ "78": "Beja",
371
+ "79": "Beja",
372
+ # --- ALGARVE ---
373
+ "80": "Faro",
374
+ "81": "Faro",
375
+ "82": "Faro",
376
+ "83": "Faro",
377
+ "84": "Faro",
378
+ "85": "Faro",
379
+ "86": "Faro",
380
+ "87": "Faro",
381
+ "88": "Faro",
382
+ "89": "Faro",
383
+ # --- REGIÕES AUTÓNOMAS ---
384
+ "90": "Madeira",
385
+ "91": "Madeira",
386
+ "92": "Madeira",
387
+ "93": "Madeira",
388
+ "95": "Açores", # Ilha de São Miguel (Ponta Delgada).
389
+ "96": "Açores", # Ilha de São Miguel (Ribeira Grande) e Santa Maria.
390
+ "97": "Açores", # Ilha Terceira (Angra do Heroísmo).
391
+ "98": "Açores", # Ilhas de São Jorge, Graciosa, Faial, Pico.
392
+ "99": "Açores", # Ilhas de Flores e Corvo.
393
+ },
394
+ }
395
+
396
# Per-country lookup tables derived from PROVINCE_TO_POSTCODE, built once at import time.
# For every country code the cache stores two index-aligned lists:
#   "choices": accent-stripped, lower-cased province names (the list for rapidfuzz to search in)
#   "keys":    the original province names, so a hit at index i in "choices" maps back to keys[i]
_NORMALIZED_PROVINCE_CACHE = {}
for country, provinces in PROVINCE_TO_POSTCODE.items():
    # Original spellings, e.g. "A Coruña", "Álava"
    original_keys = [*provinces]

    # Comparison spellings, e.g. "a coruna", "alava"
    normalized_choices = list(map(lambda name: unidecode(name).lower(), original_keys))

    _NORMALIZED_PROVINCE_CACHE[country] = dict(
        choices=normalized_choices,
        keys=original_keys,
    )
408
+
409
+ # Source: https://github.com/ariankoochak/regex-patterns-of-all-countries
410
# Compiled regular expressions per country code:
#   zip_validate_pattern   - anchored pattern a whole string must satisfy to be a postal code
#   zip_search_pattern     - unanchored pattern used to find a postal code inside free text
#   phone_validate_pattern - anchored pattern for a phone number with optional country prefix
COUNTRY_PARSING_RULES = {
    country_code: {rule_name: re.compile(raw_pattern) for rule_name, raw_pattern in raw_rules.items()}
    for country_code, raw_rules in {
        "es": {
            "zip_validate_pattern": r"^\d{5}$",
            "zip_search_pattern": r"\b\d{5}\b",
            "phone_validate_pattern": r"^(\+?34)?[67]\d{8}$",
        },
        "pt": {
            "zip_validate_pattern": r"^\d{4}[- ]{0,1}\d{3}$|^\d{4}$",
            "zip_search_pattern": r"\b\d{4}[- ]?\d{3}\b|\b\d{4}\b",
            "phone_validate_pattern": r"^(\+?351)?9[1236]\d{7}$",
        },
    }.items()
}
422
+
423
+ MADRID_DISTRICT_DIRECT_PATCH = {
424
+ # Direct corrections
425
+ "Aravaca": "Moncloa-Aravaca",
426
+ "Puerta de Hierro": "Fuencarral-El Pardo",
427
+ "Palacio": "Centro",
428
+ "Argüelles": "Moncloa-Aravaca",
429
+ "Barrio de La Estación": "Latina",
430
+ "Casa de Campo": "Moncloa-Aravaca",
431
+ "Universidad": "Centro",
432
+ "Valdezarza": "Moncloa-Aravaca",
433
+ "Cortes": "Centro",
434
+ "Barrio de la Latina": "Centro",
435
+ "Ciudad Universitaria": "Moncloa-Aravaca",
436
+ "Embajadores": "Centro",
437
+ "Justicia": "Centro",
438
+ "Sol": "Centro",
439
+ "Barrio de los Austrias": "Centro",
440
+ }
441
+
442
+ MADRID_DISTRICT_QUARTER_PATCH = {
443
+ # Rules that depend on the quarter
444
+ ("Centro", "Atocha"): "Arganzuela",
445
+ ("Centro", "Gaztambide"): "Chamberí",
446
+ ("Centro", "Imperial"): "Arganzuela",
447
+ ("Centro", "Palos de Moguer"): "Arganzuela",
448
+ ("Arganzuela", "Embajadores"): "Centro",
449
+ ("Salamanca", "La Elipa"): "Ciudad Lineal",
450
+ ("Salamanca", "Ventas"): "Ciudad Lineal",
451
+ ("Tetuán", "La Paz"): "Fuencarral-El Pardo",
452
+ ("Tetuán", "San Cristóbal"): "Villaverde",
453
+ ("Tetuán", "Colonia de San Cristóbal"): "Villaverde",
454
+ ("Tetuán", "Valdezarza"): "Moncloa-Aravaca",
455
+ ("Chamberí", "Ciudad Universitaria"): "Moncloa-Aravaca",
456
+ ("Chamberí", "Justicia"): "Centro",
457
+ ("Chamberí", "Universidad"): "Centro",
458
+ ("Fuencarral-El Pardo", "Castilla"): "Chamartín",
459
+ ("Fuencarral-El Pardo", "Valdeacederas"): "Tetuán",
460
+ ("Fuencarral-El Pardo", "Valdezarza"): "Moncloa-Aravaca",
461
+ ("Moncloa-Aravaca", "Bellas Vistas"): "Tetuán",
462
+ ("Moncloa-Aravaca", "Berruguete"): "Tetuán",
463
+ ("Moncloa-Aravaca", "Campamento"): "Latina",
464
+ ("Moncloa-Aravaca", "Gaztambide"): "Chamberí",
465
+ ("Moncloa-Aravaca", "Lucero"): "Latina",
466
+ ("Moncloa-Aravaca", "Valdeacederas"): "Tetuán",
467
+ ("Moncloa-Aravaca", "Vallehermoso"): "Chamberí",
468
+ ("Latina", "Casa de Campo"): "Moncloa-Aravaca",
469
+ ("Villaverde", "San Fermín"): "Usera",
470
+ ("San Blas - Canillejas", "Concepción"): "Ciudad Lineal",
471
+ ("San Blas - Canillejas", "Quintana"): "Ciudad Lineal",
472
+ ("Barajas", "Palomas"): "Hortaleza",
473
+ }
474
+
475
+ MADRID_QUARTER_DIRECT_PATCH = {
476
+ "Barrio de la Latina": "Palacio",
477
+ "Barrio de las Letras": "Cortes",
478
+ "Barrio de los Austrias": "Palacio",
479
+ "Colonia de San Cristóbal": "San Cristóbal",
480
+ "Encinar de los Reyes": "Valdefuentes",
481
+ "La Elipa": "Ventas",
482
+ "Las Cárcavas - San Antonio": "Valdefuentes",
483
+ "Lavapiés": "Embajadores",
484
+ "Montecarmelo": "El Goloso",
485
+ "Puerta de Hierro": "Ciudad Universitaria",
486
+ "Villaverde Alto, Casco Histórico de Villaverde": "San Andrés",
487
+ "Villaverde Bajo": "Los Rosales",
488
+ "Virgen del Cortijo": "Valdefuentes",
489
+ "Las Acacias": "Acacias",
490
+ }
491
+
492
+ # Cutoff score for rapidfuzz in the name standardization function
493
+ STANDARD_THRESHOLD = 40
@@ -1 +1 @@
1
- from .main import *
1
+ from .main import * # noqa: F403
@@ -3,20 +3,23 @@
3
3
 
4
4
  import re
5
5
  import unicodedata
6
+
6
7
  import inflection
7
8
 
8
9
  ########################################################################################################################
9
10
  # FUNCTIONS
10
11
 
12
+
11
13
def process_task_name(task_id):
    """Turn an arbitrary string into a lowercase, underscore-separated task identifier.

    Steps:
      1. Alphanumeric characters are kept; every other character is replaced by its
         Unicode name wrapped in underscores (e.g. "-" becomes "_HYPHEN-MINUS_").
         Characters in the Unicode categories Cc, Cf, Cs, Co and Cn are dropped.
      2. The result is passed through ``inflection.parameterize`` with "_" as separator,
         lower-cased, stripped of leading/trailing underscores, and runs of underscores
         are collapsed to one.
      3. If the identifier starts with a digit it is prefixed with "task_".

    Args:
        task_id: The raw name to sanitize.

    Returns:
        The sanitized identifier. An empty string is returned when nothing usable is
        left after sanitizing (previously this case raised IndexError).
    """
    spelled_out = "".join(
        c if c.isalnum() else f"_{unicodedata.name(c)}_"
        for c in task_id
        if c.isalnum() or (unicodedata.category(c) not in ("Cc", "Cf", "Cs", "Co", "Cn"))
    )
    if not spelled_out:
        # Empty input, or input made only of dropped characters: nothing to parameterize.
        return ""

    task_id = inflection.parameterize(spelled_out, separator="_")
    task_id = task_id.lower()
    task_id = task_id.strip("_")
    task_id = re.sub(r"_+", "_", task_id)
    # parameterize may reduce the text to nothing, so guard the index access.
    if task_id and task_id[0].isdigit():
        task_id = "task_" + task_id
    return task_id
@@ -8,6 +8,7 @@ from sqlalchemy.ext.declarative import declarative_base
8
8
 
9
9
  Base = declarative_base()
10
10
 
11
+
11
12
  class View(Base):
12
13
  __abstract__ = True
13
14
  is_view = True
@@ -19,4 +20,4 @@ class View(Base):
19
20
  """
20
21
  conn.execute(f"""
21
22
  CREATE OR REPLACE VIEW {cls.__tablename__} AS {query}
22
- """)
23
+ """)
datamarket/utils/main.py CHANGED
@@ -9,8 +9,13 @@ import re
9
9
  import shlex
10
10
  import subprocess
11
11
  import time
12
+ from datetime import timedelta
13
+ from typing import Sequence, overload
12
14
 
13
15
  import pendulum
16
+ from babel.numbers import parse_decimal
17
+
18
+ from ..interfaces.proxy import ProxyInterface
14
19
 
15
20
  ########################################################################################################################
16
21
  # FUNCTIONS
@@ -34,15 +39,63 @@ def set_logger(level):
34
39
  log.addHandler(ch)
35
40
 
36
41
 
37
- def ban_sleep(max_time, min_time=0):
38
- sleep_time = int(random.uniform(min_time, max_time))
39
- logger.info(f"sleeping for {sleep_time} seconds...")
42
@overload
def ban_sleep(max_time: float) -> None: ...


@overload
def ban_sleep(min_time: float, max_time: float) -> None: ...


def ban_sleep(x: float, y: float | None = None) -> None:
    """
    Sleep for a random number of seconds.

    Usage:
        ban_sleep(5)      -> sleeps ~N(5, 2.5²) seconds, truncated to >= 0
        ban_sleep(3, 7)   -> sleeps uniformly between 3 and 7 seconds
        ban_sleep(7, 3)   -> same as above (order doesn't matter)

    Args:
        x: With a single argument, the mean of the normal distribution (the standard
            deviation is half of it). With two arguments, one bound of the interval.
        y: The other bound of the interval, or None for the single-argument form.

    Note:
        The overload stubs name the parameters ``max_time``/``min_time``, but the
        implementation only accepts ``x``/``y``, so pass the arguments positionally.
    """
    if y is None:
        mean = float(x)
        std_dev = mean / 2.0
        sleep_time = random.gauss(mean, std_dev)  # noqa: S311
    else:
        low, high = sorted([float(x), float(y)])
        sleep_time = random.uniform(low, high)  # noqa: S311

    # time.sleep() raises ValueError for a negative duration, so clamp in both branches
    # (the interval form could otherwise go negative when a bound is below zero).
    sleep_time = max(0.0, sleep_time)

    logger.info(f"sleeping for {sleep_time:.2f} seconds...")
    time.sleep(sleep_time)
41
70
 
42
71
 
43
- async def ban_sleep_async(max_time, min_time=0):
44
- sleep_time = int(random.uniform(min_time, max_time)) # noqa: S311
45
- logger.info(f"sleeping for {sleep_time} seconds...")
72
@overload
async def ban_sleep_async(seconds: float) -> None: ...


@overload
async def ban_sleep_async(min_time: float, max_time: float) -> None: ...


async def ban_sleep_async(min_time: float, max_time: float | None = None) -> None:
    """
    Suspend the current coroutine for a random number of seconds.

    Usage:
        await ban_sleep_async(5)      # sleeps ~N(5, (5/2)²) seconds, truncated to >= 0
        await ban_sleep_async(3, 7)   # sleeps uniformly between 3 and 7 seconds
        await ban_sleep_async(7, 3)   # same as above (order doesn't matter)

    Args:
        min_time: With a single argument, the mean of the normal distribution (the
            standard deviation is half of it). With two arguments, one interval bound.
        max_time: The other interval bound, or None for the single-argument form.
    """
    if max_time is not None:
        # Two-argument form: order the bounds, then draw uniformly between them.
        lower, upper = sorted((float(min_time), float(max_time)))
        delay = random.uniform(lower, upper)  # noqa: S311
    else:
        # Single-argument form: normal draw around the given value, never below zero.
        center = float(min_time)
        delay = max(0.0, random.gauss(center, center / 2.0))  # noqa: S311

    logger.info(f"sleeping for {delay:.2f} seconds...")
    await asyncio.sleep(delay)
47
100
 
48
101
 
@@ -74,6 +127,19 @@ def text_to_int(text):
74
127
  return num
75
128
 
76
129
 
130
+ def text_to_float(text: str | None, locale: str = "es_ES") -> float | None:
131
+ if not text:
132
+ return None
133
+ match = re.search(r"\d(?:[\d\s.,]*\d)?", text)
134
+ if not match:
135
+ return None
136
+ number_str = match.group(0).replace(" ", "")
137
+ try:
138
+ return float(parse_decimal(number_str, locale=locale))
139
+ except Exception:
140
+ return None
141
+
142
+
77
143
  def sleep_out_interval(from_h, to_h, tz="Europe/Madrid", seconds=1800):
78
144
  while pendulum.now(tz=tz).hour >= to_h or pendulum.now(tz=tz).hour < from_h:
79
145
  logger.warning("time to sleep and not scrape anything...")
@@ -99,3 +165,58 @@ def parse_field(dict_struct, field_path, format_method=None):
99
165
  if field_value is None:
100
166
  return None
101
167
  return format_method(field_value) if format_method else field_value
168
+
169
+
170
def get_data(
    url: str,
    method: str = "GET",
    output: str = "json",
    sleep: tuple = (6, 3),
    proxy_interface: ProxyInterface = None,
    use_auth_proxies: bool = False,
    max_proxy_delay: timedelta = timedelta(minutes=10),
    ignored_status_codes: Sequence[int] = (),
    **kwargs,
):
    """
    Fetch data from a URL by delegating to ``RequestsClient.get_data``.

    A new ``RequestsClient`` is created from ``proxy_interface`` on every call and all
    other arguments are forwarded to its ``get_data`` method unchanged. The behaviour
    described below is that of the client, which is not defined in this module.

    Args:
        url (str): The target URL to fetch data from.
        method (str, optional): HTTP method to use (e.g., 'GET', 'POST'). Defaults to 'GET'.
        output (str, optional): Output format ('json', 'text', 'soup', 'response'). Defaults to 'json'.
        sleep (tuple, optional): Tuple specifying max and min sleep times (seconds) after request. Defaults to (6, 3).
        proxy_interface (ProxyInterface, optional): Proxy provider handed to the ``RequestsClient``
            constructor. Defaults to None.
        use_auth_proxies (bool, optional): Whether to use authenticated proxies. Defaults to False.
        max_proxy_delay (timedelta, optional): Maximum delay for proxy retry logic. Defaults to 10 minutes.
        ignored_status_codes (Sequence[int], optional): Status codes to ignore and return response for. Defaults to ().
        **kwargs: Additional arguments passed to the requests method (timeout defaults to 30 seconds if not specified).

    Returns:
        Depends on the 'output' argument:
            - 'json': Parsed JSON response.
            - 'text': Response text.
            - 'soup': BeautifulSoup-parsed HTML.
            - 'response': Raw requests.Response object.

    Raises:
        IgnoredHTTPError: If a response status code is in `ignored_status_codes`.
        NotFoundError: If a 404 or 410 status code is returned and not in `ignored_status_codes`.
        BadRequestError: If a 400 status code is returned and not in `ignored_status_codes`.
        EmptyResponseError: If the response has no content.
        ProxyError: On proxy-related errors.
        requests.HTTPError: For other HTTP errors if not ignored.
    """
    # Imported at call time rather than at module import time.
    from .requests import RequestsClient

    forwarded = {
        "url": url,
        "method": method,
        "output": output,
        "sleep": sleep,
        "use_auth_proxies": use_auth_proxies,
        "max_proxy_delay": max_proxy_delay,
        "ignored_status_codes": ignored_status_codes,
    }
    return RequestsClient(proxy_interface).get_data(**forwarded, **kwargs)