datamarket 0.9.28__py3-none-any.whl → 0.9.29__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datamarket might be problematic. Click here for more details.

@@ -3,12 +3,20 @@
3
3
 
4
4
  import gettext
5
5
  import logging
6
- import pycountry
7
- from geopy.distance import geodesic
6
+ from typing import Any, Dict, List, Optional, Tuple, Union
8
7
 
8
+ import pycountry
9
9
  import requests
10
+ from geopy.distance import geodesic
11
+ from jellyfish import jaro_winkler_similarity
10
12
 
11
13
  from ..params.nominatim import POSTCODES
14
+ from ..utils.strings import normalize
15
+
16
+ ########################################################################################################################
17
+ # PARAMETERS
18
+
19
+ JARO_WINKLER_THRESHOLD = 0.85
12
20
 
13
21
  ########################################################################################################################
14
22
  # CLASSES
@@ -19,11 +27,11 @@ spanish.install()
19
27
 
20
28
 
21
29
  class GeoNames:
22
- def __init__(self, endpoint):
30
+ def __init__(self, endpoint: str) -> None:
23
31
  self.endpoint = endpoint
24
32
 
25
33
  @staticmethod
26
- def validate_postcode(postcode):
34
+ def validate_postcode(postcode: Union[int, str]) -> Optional[str]:
27
35
  if isinstance(postcode, int):
28
36
  postcode = str(postcode)
29
37
 
@@ -36,148 +44,245 @@ class GeoNames:
36
44
  return postcode
37
45
 
38
46
  @staticmethod
39
- def get_province_from_postcode(postcode):
47
+ def get_province_from_postcode(postcode: Optional[str]) -> Optional[str]:
40
48
  if postcode:
41
49
  return POSTCODES[postcode[:2]]
42
50
 
43
- def reverse(self, lat, lon):
44
- return requests.get(f"{self.endpoint}/reverse?lat={lat}&lon={lon}").json()
51
+ def reverse(self, lat: Union[float, str], lon: Union[float, str]) -> Dict[str, Any]:
52
+ return requests.get(f"{self.endpoint}/reverse?lat={lat}&lon={lon}", timeout=30).json()
45
53
 
46
54
 
47
55
  class Nominatim:
48
- def __init__(self, nominatim_endpoint, geonames_endpoint):
56
+ def __init__(self, nominatim_endpoint: str, geonames_endpoint: str) -> None:
49
57
  self.endpoint = nominatim_endpoint
50
58
  self.geonames = GeoNames(geonames_endpoint)
51
59
 
52
60
  @staticmethod
53
- def get_attribute(raw_json, keys):
61
+ def _get_attribute(raw_json: Dict[str, Any], keys: List[str]) -> Any:
54
62
  for key in keys:
55
63
  if key in raw_json:
56
64
  return raw_json[key]
57
65
 
58
- def geocode(self, address):
59
- return requests.get(f"{self.endpoint}/search?q={address}&format=json").json()
66
+ def _calculate_distance(self, lat_str: Optional[str], lon_str: Optional[str], input_coords: Tuple[float, float]) -> float:
67
+ dist = float("inf")
68
+ if lat_str and lon_str:
69
+ try:
70
+ coords = (float(lat_str), float(lon_str))
71
+ dist = geodesic(input_coords, coords).km
72
+ except (ValueError, TypeError):
73
+ logger.warning("Invalid coordinates for distance calculation.")
74
+ return dist
75
+
76
+ def _parse_nominatim_result(self, nominatim_raw_json: Dict[str, Any]) -> Dict[str, Optional[str]]:
77
+ raw_address = nominatim_raw_json.get("address", {})
78
+
79
+ postcode_str = str(raw_address.get("postcode", ""))
80
+ postcode = self.geonames.validate_postcode(postcode_str)
81
+
82
+ city = self._get_attribute(raw_address, ["city", "town", "village"])
83
+ district, quarter = self._get_district_quarter(raw_address)
84
+
85
+ return {
86
+ "country": raw_address.get("country"),
87
+ "country_code": (raw_address.get("country_code") or "").lower(),
88
+ "state": raw_address.get("state"),
89
+ "province": raw_address.get("province"),
90
+ "city": city,
91
+ "postcode": postcode,
92
+ "district": district,
93
+ "quarter": quarter,
94
+ "street": raw_address.get("road"),
95
+ "number": raw_address.get("house_number"),
96
+ }
97
+
98
+ def _parse_geonames_result(self, geonames_raw_json: Dict[str, Any]) -> Dict[str, Optional[str]]:
99
+ geonames_country_code_str = geonames_raw_json.get("country_code")
100
+ country_name = None
101
+ if geonames_country_code_str:
102
+ try:
103
+ country_obj = pycountry.countries.get(alpha_2=geonames_country_code_str.upper())
104
+ if country_obj:
105
+ country_name = spanish.gettext(country_obj.name)
106
+ except LookupError:
107
+ logger.warning(f"Country name not found for code: {geonames_country_code_str} using pycountry.")
108
+
109
+ postcode_str = str(geonames_raw_json.get("postal_code", ""))
110
+ postcode = self.geonames.validate_postcode(postcode_str)
111
+ province = self.geonames.get_province_from_postcode(postcode) if postcode else None
112
+ city = geonames_raw_json.get("place_name")
113
+
114
+ return {
115
+ "country": country_name,
116
+ "country_code": (geonames_country_code_str or "").lower(),
117
+ "state": geonames_raw_json.get("community"),
118
+ "province": province,
119
+ "city": city,
120
+ "postcode": postcode,
121
+ "district": None,
122
+ "quarter": None,
123
+ "street": None,
124
+ "number": None,
125
+ }
126
+
127
+ def _get_empty_address_result(self) -> Dict[str, None]:
128
+ return {
129
+ "country": None,
130
+ "country_code": None,
131
+ "state": None,
132
+ "province": None,
133
+ "city": None,
134
+ "postcode": None,
135
+ "district": None,
136
+ "quarter": None,
137
+ "street": None,
138
+ "number": None,
139
+ }
140
+
141
+ def _select_postcode_and_derived_province(
142
+ self, parsed_nominatim_result: Dict[str, Optional[str]], parsed_geonames_result: Dict[str, Optional[str]], nominatim_address_province_raw: Optional[str]
143
+ ) -> Tuple[Optional[str], Optional[str]]:
144
+ """
145
+ Determines the postcode and its derived province based on comparisons
146
+ between Nominatim and GeoNames data, and Nominatim's raw address province.
147
+ """
148
+ nominatim_postcode = parsed_nominatim_result.get("postcode")
149
+ geonames_postcode = parsed_geonames_result.get("postcode")
150
+
151
+ province_from_nominatim_postcode = self.geonames.get_province_from_postcode(nominatim_postcode)
152
+ province_from_geonames_postcode = self.geonames.get_province_from_postcode(geonames_postcode)
153
+
154
+ norm_raw_nominatim_province = (
155
+ normalize(nominatim_address_province_raw) if nominatim_address_province_raw else ""
156
+ )
157
+ norm_province_from_nominatim_postcode = (
158
+ normalize(province_from_nominatim_postcode) if province_from_nominatim_postcode else ""
159
+ )
160
+ norm_province_from_geonames_postcode = (
161
+ normalize(province_from_geonames_postcode) if province_from_geonames_postcode else ""
162
+ )
163
+
164
+ selected_postcode = None
165
+ selected_province_from_postcode = None
166
+
167
+ # If provinces derived from Nominatim and GeoNames postcodes differ
168
+ nominatim_postcode_province_matches = False
169
+ if norm_province_from_nominatim_postcode and norm_raw_nominatim_province:
170
+ nominatim_postcode_province_matches = (
171
+ jaro_winkler_similarity(norm_province_from_nominatim_postcode, norm_raw_nominatim_province)
172
+ > JARO_WINKLER_THRESHOLD
173
+ )
174
+
175
+ geonames_postcode_province_matches = False
176
+ if norm_province_from_geonames_postcode and norm_raw_nominatim_province:
177
+ geonames_postcode_province_matches = (
178
+ jaro_winkler_similarity(norm_province_from_geonames_postcode, norm_raw_nominatim_province)
179
+ > JARO_WINKLER_THRESHOLD
180
+ )
181
+
182
+ # Prefer GeoNames postcode if its province matches Nominatim's raw address province,
183
+ # and Nominatim's own postcode-derived province does not.
184
+ if nominatim_postcode_province_matches:
185
+ selected_postcode = nominatim_postcode
186
+ selected_province_from_postcode = province_from_nominatim_postcode
187
+ if geonames_postcode_province_matches and not nominatim_postcode_province_matches:
188
+ selected_postcode = geonames_postcode
189
+ selected_province_from_postcode = province_from_geonames_postcode
190
+
191
+ return selected_postcode, selected_province_from_postcode
192
+
193
+ def _select_final_result(
194
+ self,
195
+ parsed_nominatim_result: Dict[str, Optional[str]],
196
+ parsed_geonames_result: Dict[str, Optional[str]],
197
+ dist_nominatim: float,
198
+ dist_geonames: float,
199
+ authoritative_postcode: Optional[str],
200
+ authoritative_province_from_postcode: Optional[str],
201
+ nominatim_address_province_raw: Optional[str],
202
+ ) -> Dict[str, Optional[str]]:
203
+ """
204
+ Selects the final address result based on distances and applies the authoritative postcode/province.
205
+ """
206
+ if dist_nominatim <= dist_geonames and dist_nominatim != float("inf"):
207
+ final_result = parsed_nominatim_result
208
+ final_result["postcode"] = authoritative_postcode
209
+ final_result["province"] = nominatim_address_province_raw
210
+ elif dist_geonames < dist_nominatim and dist_geonames != float("inf"):
211
+ final_result = parsed_geonames_result
212
+ final_result["postcode"] = authoritative_postcode
213
+ final_result["province"] = authoritative_province_from_postcode
214
+ else:
215
+ final_result = self._get_empty_address_result()
216
+ return final_result
217
+
218
+ def _get_district_quarter(self, raw_json: Dict[str, Any]) -> Tuple[Optional[str], Optional[str]]:
219
+ district = self._get_attribute(raw_json, ["city_district", "suburb", "borough"])
220
+ quarter = self._get_attribute(raw_json, ["quarter", "neighbourhood"])
60
221
 
61
- def geocode_parsed(self, address):
222
+ if not district and quarter:
223
+ district = quarter
224
+ quarter = None
225
+
226
+ return district, quarter
227
+
228
+ def geocode(self, address: str) -> List[Dict[str, Any]]:
229
+ return requests.get(f"{self.endpoint}/search?q={address}&format=json", timeout=30).json()
230
+
231
+ def geocode_parsed(self, address: str) -> Optional[Dict[str, Optional[str]]]:
62
232
  results = self.geocode(address)
63
233
 
64
234
  if results:
65
235
  return self.reverse_parsed(results[0]["lat"], results[0]["lon"])
66
236
 
67
- def reverse(self, lat, lon):
68
- return requests.get(f"{self.endpoint}/reverse?lat={lat}&lon={lon}&format=json").json()
237
+ def reverse(self, lat: Union[float, str], lon: Union[float, str]) -> Dict[str, Any]:
238
+ return requests.get(f"{self.endpoint}/reverse?lat={lat}&lon={lon}&format=json", timeout=30).json()
239
+
240
+ def reverse_parsed(self, lat: Union[float, str], lon: Union[float, str]) -> Dict[str, Optional[str]]:
241
+ nominatim_response = self.reverse(lat, lon)
242
+ geonames_response = self.geonames.reverse(lat, lon)
69
243
 
70
- def reverse_parsed(self, lat, lon):
71
- nominatim_raw_json = self.reverse(lat, lon)
72
- geonames_raw_json = self.geonames.reverse(lat, lon)
244
+ # Initial parsing
245
+ parsed_nominatim_result = self._parse_nominatim_result(nominatim_response)
246
+ parsed_geonames_result = self._parse_geonames_result(geonames_response)
73
247
 
74
- nominatim_res_lat_str = nominatim_raw_json.get("lat")
75
- nominatim_res_lon_str = nominatim_raw_json.get("lon")
76
- geonames_res_lat_str = geonames_raw_json.get("lat")
77
- geonames_res_lon_str = geonames_raw_json.get("lon")
248
+ # Determine authoritative postcode
249
+ raw_nominatim_province = nominatim_response.get("address", {}).get("province")
250
+ selected_postcode, selected_province_from_postcode = self._select_postcode_and_derived_province(
251
+ parsed_nominatim_result, parsed_geonames_result, raw_nominatim_province
252
+ )
78
253
 
79
- dist_nominatim = float("inf")
80
- dist_geonames = float("inf")
254
+ # Calculate distances
255
+ nominatim_response_lat = nominatim_response.get("lat")
256
+ nominatim_response_lon = nominatim_response.get("lon")
257
+ geonames_response_lat = geonames_response.get("lat")
258
+ geonames_response_lon = geonames_response.get("lon")
81
259
 
260
+ input_coords = None
82
261
  try:
83
262
  input_coords = (float(lat), float(lon))
84
263
  except (ValueError, TypeError):
85
264
  logger.error(f"Invalid input coordinates for distance calculation: lat={lat}, lon={lon}")
86
- else:
87
- if nominatim_res_lat_str and nominatim_res_lon_str:
88
- try:
89
- nominatim_coords = (float(nominatim_res_lat_str), float(nominatim_res_lon_str))
90
- dist_nominatim = geodesic(input_coords, nominatim_coords).km
91
- except (ValueError, TypeError):
92
- logger.warning("Invalid Nominatim coordinates for distance calculation.")
93
-
94
- if geonames_res_lat_str and geonames_res_lon_str:
95
- try:
96
- geonames_coords = (float(geonames_res_lat_str), float(geonames_res_lon_str))
97
- dist_geonames = geodesic(input_coords, geonames_coords).km
98
- except (ValueError, TypeError):
99
- logger.warning("Invalid GeoNames coordinates for distance calculation.")
100
-
101
- if dist_nominatim <= dist_geonames and nominatim_res_lat_str is not None and nominatim_res_lon_str is not None:
102
- # Use Nominatim data
103
- raw_address = nominatim_raw_json.get("address", {})
104
- postcode_str = str(raw_address.get("postcode", ""))
105
- postcode = self.geonames.validate_postcode(postcode_str)
106
- province = self.geonames.get_province_from_postcode(postcode) if postcode else None
107
- city = self.get_attribute(raw_address, ["city", "town", "village"])
108
- district, quarter = self.get_district_quarter(raw_address)
109
-
110
- return {
111
- "country": raw_address.get("country"),
112
- "country_code": (raw_address.get("country_code") or "").lower(),
113
- "state": raw_address.get("state"),
114
- "province": province,
115
- "city": city,
116
- "postcode": postcode,
117
- "district": district,
118
- "quarter": quarter,
119
- "street": raw_address.get("road"),
120
- "number": raw_address.get("house_number"),
121
- }
122
-
123
- elif dist_geonames < dist_nominatim and geonames_res_lat_str is not None and geonames_res_lon_str is not None:
124
- # Use GeoNames data
125
- geonames_country_code_str = geonames_raw_json.get("country_code")
126
- country_name = None
127
- if geonames_country_code_str:
128
- try:
129
- country_obj = pycountry.countries.get(alpha_2=geonames_country_code_str.upper())
130
- if country_obj:
131
- country_name = spanish.gettext(country_obj.name)
132
- except LookupError:
133
- logger.warning(f"Country name not found for code: {geonames_country_code_str} using pycountry.")
134
-
135
- postcode_str = str(geonames_raw_json.get("postal_code", ""))
136
- postcode = self.geonames.validate_postcode(postcode_str)
137
- province = self.geonames.get_province_from_postcode(postcode) if postcode else None
138
- city = geonames_raw_json.get("place_name")
139
-
140
- return {
141
- "country": country_name,
142
- "country_code": (geonames_country_code_str or "").lower(),
143
- "state": geonames_raw_json.get("community"),
144
- "province": province,
145
- "city": city,
146
- "postcode": postcode,
147
- "district": None,
148
- "quarter": None,
149
- "street": None,
150
- "number": None,
151
- }
265
+ return self._get_empty_address_result()
152
266
 
153
- else:
154
- # Neither source provided valid coordinates
155
- return {
156
- "country": None,
157
- "country_code": None,
158
- "state": None,
159
- "province": None,
160
- "city": None,
161
- "postcode": None,
162
- "district": None,
163
- "quarter": None,
164
- "street": None,
165
- "number": None,
166
- }
167
-
168
- def get_district_quarter(self, raw_json):
169
- district = self.get_attribute(raw_json, ["city_district", "suburb", "borough"])
170
- quarter = self.get_attribute(raw_json, ["quarter", "neighbourhood"])
267
+ dist_nominatim = self._calculate_distance(nominatim_response_lat, nominatim_response_lon, input_coords)
268
+ dist_geonames = self._calculate_distance(geonames_response_lat, geonames_response_lon, input_coords)
171
269
 
172
- if not district and quarter:
173
- district = quarter
174
- quarter = None
270
+ # Select final result
271
+ final_result = self._select_final_result(
272
+ parsed_nominatim_result,
273
+ parsed_geonames_result,
274
+ dist_nominatim,
275
+ dist_geonames,
276
+ selected_postcode,
277
+ selected_province_from_postcode,
278
+ raw_nominatim_province,
279
+ )
175
280
 
176
- return district, quarter
281
+ return final_result
177
282
 
178
283
 
179
284
  class NominatimInterface(Nominatim):
180
- def __init__(self, config):
285
+ def __init__(self, config: Dict[str, Any]) -> None:
181
286
  if "osm" in config:
182
287
  self.config = config["osm"]
183
288
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: datamarket
3
- Version: 0.9.28
3
+ Version: 0.9.29
4
4
  Summary: Utilities that integrate advanced scraping knowledge into just one library.
5
5
  License: GPL-3.0-or-later
6
6
  Author: DataMarket
@@ -83,6 +83,7 @@ Requires-Dist: google-auth-oauthlib (>=1.0.0,<2.0.0) ; extra == "google-auth-oau
83
83
  Requires-Dist: html2text (>=2024.0.0,<2025.0.0) ; extra == "html2text"
84
84
  Requires-Dist: httpx[http2] (>=0.28.0,<0.29.0) ; extra == "httpx"
85
85
  Requires-Dist: inflection (>=0.5.0,<0.6.0)
86
+ Requires-Dist: jellyfish (>=1.0.0,<2.0.0)
86
87
  Requires-Dist: jinja2 (>=3.0.0,<4.0.0)
87
88
  Requires-Dist: json5 (>=0.10.0,<0.11.0) ; extra == "json5"
88
89
  Requires-Dist: lxml[html-clean] (>=5.0.0,<6.0.0) ; extra == "lxml"
@@ -4,7 +4,7 @@ datamarket/interfaces/alchemy.py,sha256=4q_gLKCKPK437VKOpdBKSrCyy42P_yWxIhE7KuvH
4
4
  datamarket/interfaces/aws.py,sha256=7KLUeBxmPN7avEMPsu5HC_KHB1N7W6Anp2X8fo43mlw,2383
5
5
  datamarket/interfaces/drive.py,sha256=shbV5jpQVe_KPE-8Idx6Z9te5Zu1SmVfrvSAyd9ZIgE,2915
6
6
  datamarket/interfaces/ftp.py,sha256=o0KlJxtksbop9OjCiQRzyAa2IeG_ExVXagS6apwrAQo,1881
7
- datamarket/interfaces/nominatim.py,sha256=TD4OhWdIwn53Va41BS1ugogHEZw0ANKxWIfc9G2JWLU,7280
7
+ datamarket/interfaces/nominatim.py,sha256=rUnodcRKyZ_reBtyfFFjXNqP1TN0NMScW7zSGiJQ10I,12380
8
8
  datamarket/interfaces/peerdb.py,sha256=cwYwvO740GyaPo9zLAwJsf3UeJDGDiYzjQVM9Q6s-_g,23652
9
9
  datamarket/interfaces/proxy.py,sha256=updoOStKd8-nQBbxWbnD9eOt6HksnYi-5dQ0rEySf5M,3152
10
10
  datamarket/interfaces/tinybird.py,sha256=AYrcRGNOCoCt7ojilkWa27POROee9sTCwZ61GGHEPeM,2698
@@ -19,7 +19,7 @@ datamarket/utils/soda.py,sha256=eZTXFbI1P3WoMd1MM-YjoVTpdjTcDSWuvBb7ViBMhSQ,941
19
19
  datamarket/utils/strings.py,sha256=rEX9NeBG4C7RECgT0EQebgoFoxgZMy9-7EcBSxgBANU,5654
20
20
  datamarket/utils/typer.py,sha256=FDF3l6gh3UlAFPsHCtesnekvct2rKz0oFn3uKARBQvE,814
21
21
  datamarket/utils/types.py,sha256=vxdQZdwdXrfPR4Es52gBgol-tMRIOD6oK9cBo3rB0JQ,74
22
- datamarket-0.9.28.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
23
- datamarket-0.9.28.dist-info/METADATA,sha256=5V61eLQ9HR_8Tl7J0sRKwR_mzKbJctIjvEebNtPtqug,6546
24
- datamarket-0.9.28.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
25
- datamarket-0.9.28.dist-info/RECORD,,
22
+ datamarket-0.9.29.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
23
+ datamarket-0.9.29.dist-info/METADATA,sha256=h4DuPT0ToLAN6vSLidYyriB9gtKjjaDPcf2MWH5fm44,6588
24
+ datamarket-0.9.29.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
25
+ datamarket-0.9.29.dist-info/RECORD,,