datamarket 0.6.0__py3-none-any.whl → 0.10.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datamarket might be problematic. Click here for more details.

Files changed (38):
  1. datamarket/__init__.py +0 -1
  2. datamarket/exceptions/__init__.py +1 -0
  3. datamarket/exceptions/main.py +118 -0
  4. datamarket/interfaces/alchemy.py +1934 -25
  5. datamarket/interfaces/aws.py +81 -14
  6. datamarket/interfaces/azure.py +127 -0
  7. datamarket/interfaces/drive.py +60 -10
  8. datamarket/interfaces/ftp.py +37 -14
  9. datamarket/interfaces/llm.py +1220 -0
  10. datamarket/interfaces/nominatim.py +314 -42
  11. datamarket/interfaces/peerdb.py +272 -104
  12. datamarket/interfaces/proxy.py +354 -50
  13. datamarket/interfaces/tinybird.py +7 -15
  14. datamarket/params/nominatim.py +439 -0
  15. datamarket/utils/__init__.py +1 -1
  16. datamarket/utils/airflow.py +10 -7
  17. datamarket/utils/alchemy.py +2 -1
  18. datamarket/utils/logs.py +88 -0
  19. datamarket/utils/main.py +138 -10
  20. datamarket/utils/nominatim.py +201 -0
  21. datamarket/utils/playwright/__init__.py +0 -0
  22. datamarket/utils/playwright/async_api.py +274 -0
  23. datamarket/utils/playwright/sync_api.py +281 -0
  24. datamarket/utils/requests.py +655 -0
  25. datamarket/utils/selenium.py +6 -12
  26. datamarket/utils/strings/__init__.py +1 -0
  27. datamarket/utils/strings/normalization.py +217 -0
  28. datamarket/utils/strings/obfuscation.py +153 -0
  29. datamarket/utils/strings/standardization.py +40 -0
  30. datamarket/utils/typer.py +2 -1
  31. datamarket/utils/types.py +1 -0
  32. datamarket-0.10.3.dist-info/METADATA +172 -0
  33. datamarket-0.10.3.dist-info/RECORD +38 -0
  34. {datamarket-0.6.0.dist-info → datamarket-0.10.3.dist-info}/WHEEL +1 -2
  35. datamarket-0.6.0.dist-info/METADATA +0 -49
  36. datamarket-0.6.0.dist-info/RECORD +0 -24
  37. datamarket-0.6.0.dist-info/top_level.txt +0 -1
  38. {datamarket-0.6.0.dist-info → datamarket-0.10.3.dist-info/licenses}/LICENSE +0 -0
datamarket/interfaces/nominatim.py
@@ -1,111 +1,383 @@
1
1
  ########################################################################################################################
2
2
  # IMPORTS
3
3
 
4
+ import gettext
4
5
  import logging
6
+ from typing import Any, Dict, List, Optional, Tuple, Union
5
7
 
8
+ import pycountry
6
9
  import requests
10
+ from geopy.distance import geodesic
11
+ from jellyfish import jaro_winkler_similarity
7
12
 
8
- from ..params.nominatim import POSTCODES
13
+ from ..params.nominatim import (
14
+ CITY_TO_PROVINCE,
15
+ MADRID_DISTRICT_DIRECT_PATCH,
16
+ MADRID_DISTRICT_QUARTER_PATCH,
17
+ MADRID_QUARTER_DIRECT_PATCH,
18
+ POSTCODES,
19
+ )
20
+ from ..utils.nominatim import standardize_admin_division
21
+ from ..utils.strings import normalize
22
+
23
+ ########################################################################################################################
24
+ # PARAMETERS
25
+
26
+ JARO_WINKLER_THRESHOLD = 0.85
27
+ CLOSE_KM = 2.0
9
28
 
10
29
  ########################################################################################################################
11
30
  # CLASSES
12
31
 
13
32
  logger = logging.getLogger(__name__)
33
+ spanish = gettext.translation("iso3166-1", pycountry.LOCALES_DIR, languages=["es"])
34
+ spanish.install()
14
35
 
15
36
 
class GeoNames:
    """Thin client for a GeoNames-style reverse endpoint plus postcode helpers.

    The postcode helpers rely on the module-level ``POSTCODES`` mapping, which
    is keyed by the first two characters of a five-character postcode and maps
    to a province name (presumably Spanish postcodes -- confirm against
    ``params.nominatim``).
    """

    def __init__(self, endpoint: str) -> None:
        """Store the base URL of the GeoNames service (no trailing ``/reverse``)."""
        self.endpoint = endpoint

    @staticmethod
    def validate_postcode(postcode: Union[int, str, None]) -> Optional[str]:
        """Return a normalised five-digit postcode, or ``None`` if it is not valid.

        Args:
            postcode: Postcode as text or as an integer. Four-character values
                are left-padded with ``"0"`` (an integer such as ``8001`` has
                lost its leading zero).

        Returns:
            The five-digit postcode when it consists only of ASCII digits and
            its two-digit prefix is a key of ``POSTCODES``; otherwise ``None``.
        """
        if isinstance(postcode, int):
            postcode = str(postcode)
        if not isinstance(postcode, str):
            # None or any other unexpected type: nothing to validate.
            return None

        candidate = postcode.strip()
        if len(candidate) == 4:
            candidate = f"0{candidate}"

        # Require real digits: a prefix match alone would accept values such as "28abc".
        is_five_digits = len(candidate) == 5 and candidate.isascii() and candidate.isdigit()
        if is_five_digits and candidate[:2] in POSTCODES:
            return candidate
        return None

    @staticmethod
    def get_province_from_postcode(postcode: Optional[str]) -> Optional[str]:
        """Return the province mapped to the postcode's two-character prefix.

        Returns ``None`` for an empty/``None`` postcode or an unknown prefix.
        """
        if not postcode:
            return None
        return POSTCODES.get(postcode[:2])

    def reverse(self, lat: Union[float, str], lon: Union[float, str]) -> Dict[str, Any]:
        """Call ``<endpoint>/reverse`` for the coordinates and return the decoded JSON.

        The coordinates are passed through ``params`` so that ``requests``
        builds and encodes the query string.
        """
        response = requests.get(
            f"{self.endpoint}/reverse",
            params={"lat": lat, "lon": lon},
            timeout=30,
        )
        return response.json()
40
61
 
41
62
 
class Nominatim:
    """Geocoder that reconciles the answers of a Nominatim and a GeoNames endpoint.

    ``reverse_parsed`` queries both services for the same coordinates, parses
    each answer into a flat address dict, decides which postcode/province pair
    to trust, and returns the address block of whichever service answered
    closer to the requested point.
    """

    def __init__(self, nominatim_endpoint: str, geonames_endpoint: str) -> None:
        """Store the Nominatim base URL and build the GeoNames client."""
        self.endpoint = nominatim_endpoint
        self.geonames = GeoNames(geonames_endpoint)

    @staticmethod
    def _get_attribute(raw_json: Dict[str, Any], keys: List[str]) -> Any:
        """Return the value of the first key of *keys* present in *raw_json*, else ``None``."""
        for key in keys:
            if key in raw_json:
                return raw_json[key]
        return None

    def _calculate_distance(
        self,
        lat_str: Optional[Union[float, str]],
        lon_str: Optional[Union[float, str]],
        input_coords: Tuple[float, float],
    ) -> float:
        """Return the geodesic distance in km between *input_coords* and a result.

        Returns ``float("inf")`` when either coordinate is missing (``None`` or
        an empty string) or cannot be used to compute a distance.
        """
        # Only None / "" count as missing; a numeric 0 is a legitimate coordinate.
        if lat_str is None or lon_str is None or lat_str == "" or lon_str == "":
            return float("inf")
        try:
            return geodesic(input_coords, (float(lat_str), float(lon_str))).km
        except (ValueError, TypeError):
            logger.warning("Invalid coordinates for distance calculation.")
            return float("inf")

    def _parse_nominatim_result(self, nominatim_raw_json: Dict[str, Any]) -> Dict[str, Optional[str]]:
        """Flatten a Nominatim reverse response into the common address dict.

        A missing (or null) ``address`` section yields a dict whose values are
        all ``None`` except ``country_code``, which is then ``""``.
        """
        raw_address = nominatim_raw_json.get("address") or {}

        # `or ""` keeps a null postcode from being stringified to "None".
        postcode = self.geonames.validate_postcode(str(raw_address.get("postcode") or ""))
        city = self._get_attribute(raw_address, ["city", "town", "village"])
        district, quarter = self._get_district_quarter(raw_address)

        return {
            "country": raw_address.get("country"),
            "country_code": (raw_address.get("country_code") or "").lower(),
            "state": raw_address.get("state"),
            # Fall back to the city -> province table when Nominatim gives no province.
            "province": raw_address.get("province") or CITY_TO_PROVINCE.get(city),
            "city": city,
            "postcode": postcode,
            "district": district,
            "quarter": quarter,
            "street": raw_address.get("road"),
            "number": raw_address.get("house_number"),
        }

    def _parse_geonames_result(self, geonames_raw_json: Dict[str, Any]) -> Dict[str, Optional[str]]:
        """Flatten a GeoNames reverse response into the common address dict.

        GeoNames contributes country, state (``community``), city
        (``place_name``) and postcode; the province is derived from the
        postcode. District, quarter, street and number are always ``None``.
        """
        country_code = geonames_raw_json.get("country_code")
        country_name = None
        if country_code:
            try:
                country = pycountry.countries.get(alpha_2=country_code.upper())
                if country:
                    # Translate the ISO 3166-1 name with the module-level "es" catalogue.
                    country_name = spanish.gettext(country.name)
            except LookupError:
                logger.warning("Country name not found for code: %s using pycountry.", country_code)

        postcode = self.geonames.validate_postcode(str(geonames_raw_json.get("postal_code") or ""))

        return {
            "country": country_name,
            "country_code": (country_code or "").lower(),
            "state": geonames_raw_json.get("community"),
            # get_province_from_postcode returns None for a missing postcode.
            "province": self.geonames.get_province_from_postcode(postcode),
            "city": geonames_raw_json.get("place_name"),
            "postcode": postcode,
            "district": None,
            "quarter": None,
            "street": None,
            "number": None,
        }

    def _get_empty_address_result(self) -> Dict[str, None]:
        """Return a fresh address dict with every field set to ``None``."""
        return {
            "country": None,
            "country_code": None,
            "state": None,
            "province": None,
            "city": None,
            "postcode": None,
            "district": None,
            "quarter": None,
            "street": None,
            "number": None,
        }

    @staticmethod
    def _provinces_match(left: str, right: str) -> bool:
        """Return True when both names are non-empty and their Jaro-Winkler similarity exceeds the threshold."""
        if not left or not right:
            return False
        return jaro_winkler_similarity(left, right) > JARO_WINKLER_THRESHOLD

    def _select_postcode_and_derived_province(
        self,
        parsed_nominatim_result: Dict[str, Optional[str]],
        parsed_geonames_result: Dict[str, Optional[str]],
        nominatim_address_province_raw: Optional[str],
        dist_nominatim: float,  # distance Nominatim <-> input (km)
        dist_geonames: float,  # distance GeoNames <-> input (km)
    ) -> Tuple[Optional[str], Optional[str], Optional[str]]:
        """Decide the authoritative postcode, its derived province and the state.

        Strategy:
            1. Derive a province from each service's postcode.
            2. Validate each postcode/province pair:
               - Nominatim: the derived province must resemble the raw
                 province string.
               - GeoNames: the derived province must resemble the raw province
                 string; if there is no raw province and the Nominatim point
                 is within ``CLOSE_KM``, it must resemble the Nominatim-derived
                 province; failing that, the postcode is accepted when the
                 GeoNames point itself is within ``CLOSE_KM``.
            3. Nominatim takes precedence over GeoNames.

        Returns:
            ``(postcode, province, state)``, or ``(None, None, None)`` when
            neither postcode passes validation.
        """
        nominatim_postcode = parsed_nominatim_result.get("postcode")
        geonames_postcode = parsed_geonames_result.get("postcode")

        province_from_nominatim_pc = self.geonames.get_province_from_postcode(nominatim_postcode)
        province_from_geonames_pc = self.geonames.get_province_from_postcode(geonames_postcode)

        # Normalised strings for the similarity comparisons ("" means "not available").
        norm_raw_province = normalize(nominatim_address_province_raw) if nominatim_address_province_raw else ""
        norm_nominatim = normalize(province_from_nominatim_pc) if province_from_nominatim_pc else ""
        norm_geonames = normalize(province_from_geonames_pc) if province_from_geonames_pc else ""

        nominatim_is_close = dist_nominatim < CLOSE_KM
        geonames_is_close = dist_geonames < CLOSE_KM

        nominatim_pc_valid = self._provinces_match(norm_nominatim, norm_raw_province)

        # 1) Compare with the raw province string.
        geonames_pc_valid = self._provinces_match(norm_geonames, norm_raw_province)
        # 2) No raw province: trust the Nominatim-derived province only if Nominatim is close.
        if not geonames_pc_valid and not norm_raw_province and nominatim_is_close:
            geonames_pc_valid = self._provinces_match(norm_geonames, norm_nominatim)
        # 3) Fallback: accept the GeoNames postcode when its own point is close.
        if not geonames_pc_valid and geonames_is_close and geonames_postcode:
            geonames_pc_valid = True

        if nominatim_pc_valid:
            state = parsed_nominatim_result.get("state")
            if not state and geonames_pc_valid:
                state = parsed_geonames_result.get("state")
            return nominatim_postcode, province_from_nominatim_pc, state

        if geonames_pc_valid:
            # The Nominatim postcode failed validation on this path, so there is
            # no validated Nominatim state to fall back to.
            return geonames_postcode, province_from_geonames_pc, parsed_geonames_result.get("state")

        return None, None, None

    def _select_final_result(
        self,
        parsed_nominatim_result: Dict[str, Optional[str]],
        parsed_geonames_result: Dict[str, Optional[str]],
        dist_nominatim: float,
        dist_geonames: float,
        authoritative_postcode: Optional[str],
        authoritative_province_from_postcode: Optional[str],
        authoritative_state: Optional[str],
    ) -> Dict[str, Optional[str]]:
        """Pick the address block by distance and apply the authoritative fields.

        Rules:
            - The source with the smaller finite distance wins (Nominatim on a tie).
            - ``postcode`` is overwritten when *authoritative_postcode* is truthy.
            - ``province`` and ``state`` are overwritten together when
              *authoritative_province_from_postcode* is truthy; ``state`` is
              then set even if *authoritative_state* is ``None``.
            - If neither distance is finite, an empty address is returned.

        The returned dict is a copy; the parsed inputs are not modified.
        """
        infinity = float("inf")
        if dist_nominatim != infinity and dist_nominatim <= dist_geonames:
            chosen = parsed_nominatim_result
        elif dist_geonames != infinity and dist_geonames < dist_nominatim:
            chosen = parsed_geonames_result
        else:
            return self._get_empty_address_result()

        # Copy so that the overrides below do not leak into the caller's dict.
        final_result = dict(chosen)

        if authoritative_postcode:
            final_result["postcode"] = authoritative_postcode

        if authoritative_province_from_postcode:
            final_result["province"] = authoritative_province_from_postcode
            # NOTE(review): this also replaces an existing state with None when no
            # authoritative state was found -- confirm that is intended.
            final_result["state"] = authoritative_state

        return final_result

    @staticmethod
    def _patch_district(raw_district: Optional[str], raw_quarter: Optional[str] = None) -> Optional[str]:
        """Return the patched district name.

        With a quarter, the ``(district, quarter)`` pair is looked up in
        ``MADRID_DISTRICT_QUARTER_PATCH``; otherwise the district alone is
        looked up in ``MADRID_DISTRICT_DIRECT_PATCH``. Unknown keys return
        *raw_district* unchanged.
        """
        if raw_quarter:
            return MADRID_DISTRICT_QUARTER_PATCH.get((raw_district, raw_quarter), raw_district)
        return MADRID_DISTRICT_DIRECT_PATCH.get(raw_district, raw_district)

    @staticmethod
    def _patch_quarter(raw_quarter: Optional[str]) -> Optional[str]:
        """Return the patched quarter name, or *raw_quarter* if there is no patch for it."""
        return MADRID_QUARTER_DIRECT_PATCH.get(raw_quarter, raw_quarter)

    def _get_district_quarter(self, raw_json: Dict[str, Any]) -> Tuple[Optional[str], Optional[str]]:
        """Extract ``(district, quarter)`` from a Nominatim address section.

        When the ``city`` field is exactly ``"Madrid"`` the names go through
        the Madrid patch tables: district alone first, then the quarter, then
        the district again using the patched quarter.
        """
        district = self._get_attribute(raw_json, ["city_district", "suburb", "borough"])
        quarter = self._get_attribute(raw_json, ["quarter", "neighbourhood"])
        if raw_json.get("city") == "Madrid":
            mid_district = self._patch_district(district)
            quarter = self._patch_quarter(quarter)
            district = self._patch_district(mid_district, quarter)
        return district, quarter

    def geocode(self, address: str) -> List[Dict[str, Any]]:
        """Search Nominatim for a free-text *address* and return the decoded JSON.

        The address is sent through ``params`` so characters such as ``&``,
        ``#`` or ``+`` are URL-encoded instead of corrupting the query string.
        """
        response = requests.get(
            f"{self.endpoint}/search",
            params={"q": address, "format": "json"},
            timeout=30,
        )
        return response.json()

    def geocode_parsed(self, address: str) -> Optional[Dict[str, Optional[str]]]:
        """Geocode *address* and return the parsed address of the first hit.

        Returns ``None`` when the search yields no results or the response is
        not a list of hits.
        """
        results = self.geocode(address)
        # Guard against a non-list error payload as well as an empty hit list.
        if isinstance(results, list) and results:
            return self.reverse_parsed(results[0]["lat"], results[0]["lon"])
        return None

    def reverse(self, lat: Union[float, str], lon: Union[float, str]) -> Dict[str, Any]:
        """Call Nominatim ``/reverse`` for the coordinates and return the decoded JSON."""
        response = requests.get(
            f"{self.endpoint}/reverse",
            params={"lat": lat, "lon": lon, "format": "json"},
            timeout=30,
        )
        return response.json()

    def reverse_parsed(self, lat: Union[float, str], lon: Union[float, str]) -> Dict[str, Optional[str]]:
        """Reverse-geocode a point with both services and return one merged address.

        Returns an all-``None`` address dict when *lat*/*lon* cannot be
        converted to floats; in that case no request is sent.
        """
        # Validate the input first so that bad coordinates never reach the services.
        try:
            input_coords = (float(lat), float(lon))
        except (ValueError, TypeError):
            logger.error(f"Invalid input coordinates for distance calculation: lat={lat}, lon={lon}")
            return self._get_empty_address_result()

        nominatim_response = self.reverse(lat, lon)
        geonames_response = self.geonames.reverse(lat, lon)

        parsed_nominatim_result = self._parse_nominatim_result(nominatim_response)
        parsed_geonames_result = self._parse_geonames_result(geonames_response)

        # Distance between the requested point and the point each service answered with.
        dist_nominatim = self._calculate_distance(
            nominatim_response.get("lat"), nominatim_response.get("lon"), input_coords
        )
        dist_geonames = self._calculate_distance(
            geonames_response.get("lat"), geonames_response.get("lon"), input_coords
        )

        selected_postcode, selected_province, selected_state = self._select_postcode_and_derived_province(
            parsed_nominatim_result,
            parsed_geonames_result,
            parsed_nominatim_result.get("province"),
            dist_nominatim,
            dist_geonames,
        )

        final_result = self._select_final_result(
            parsed_nominatim_result,
            parsed_geonames_result,
            dist_nominatim,
            dist_geonames,
            selected_postcode,
            selected_province,
            selected_state,
        )

        final_result["province"] = standardize_admin_division(
            name=final_result["province"], level="province", country_code=final_result["country_code"]
        )
        final_result["state"] = standardize_admin_division(
            name=final_result["state"], level="state", country_code=final_result["country_code"]
        )
        return final_result
371
+
99
372
 
class NominatimInterface(Nominatim):
    """Config-driven wrapper that builds a ``Nominatim`` client from an ``osm`` section."""

    def __init__(self, config: Dict[str, Any]) -> None:
        """Read both endpoints from ``config["osm"]`` and initialise the base class.

        When *config* has no ``"osm"`` key, a warning is logged and the
        instance is left without ``config``, endpoint or ``geonames``
        attributes; the base initialiser is not called.
        """
        if "osm" not in config:
            logger.warning("no osm section in config")
            return

        osm_section = config["osm"]
        self.config = osm_section

        self.nominatim_endpoint = osm_section["nominatim_endpoint"]
        self.geonames_endpoint = osm_section["geonames_endpoint"]

        super().__init__(self.nominatim_endpoint, self.geonames_endpoint)