datamarket 0.9.27__py3-none-any.whl → 0.9.29__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datamarket might be problematic. Click here for more details.

@@ -1,24 +1,37 @@
1
1
  ########################################################################################################################
2
2
  # IMPORTS
3
3
 
4
+ import gettext
4
5
  import logging
6
+ from typing import Any, Dict, List, Optional, Tuple, Union
5
7
 
8
+ import pycountry
6
9
  import requests
10
+ from geopy.distance import geodesic
11
+ from jellyfish import jaro_winkler_similarity
7
12
 
8
13
  from ..params.nominatim import POSTCODES
14
+ from ..utils.strings import normalize
15
+
16
+ ########################################################################################################################
17
+ # PARAMETERS
18
+
19
+ JARO_WINKLER_THRESHOLD = 0.85
9
20
 
10
21
  ########################################################################################################################
11
22
  # CLASSES
12
23
 
13
24
  logger = logging.getLogger(__name__)
25
+ spanish = gettext.translation("iso3166-1", pycountry.LOCALES_DIR, languages=["es"])
26
+ spanish.install()
14
27
 
15
28
 
16
29
  class GeoNames:
17
- def __init__(self, endpoint):
30
+ def __init__(self, endpoint: str) -> None:
18
31
  self.endpoint = endpoint
19
32
 
20
33
  @staticmethod
21
- def validate_postcode(postcode):
34
+ def validate_postcode(postcode: Union[int, str]) -> Optional[str]:
22
35
  if isinstance(postcode, int):
23
36
  postcode = str(postcode)
24
37
 
@@ -31,64 +44,180 @@ class GeoNames:
31
44
  return postcode
32
45
 
33
46
  @staticmethod
34
- def get_province_from_postcode(postcode):
47
+ def get_province_from_postcode(postcode: Optional[str]) -> Optional[str]:
35
48
  if postcode:
36
49
  return POSTCODES[postcode[:2]]
37
50
 
38
- def reverse(self, lat, lon):
39
- return requests.get(f"{self.endpoint}/reverse?lat={lat}&lon={lon}").json()
51
+ def reverse(self, lat: Union[float, str], lon: Union[float, str]) -> Dict[str, Any]:
52
+ return requests.get(f"{self.endpoint}/reverse?lat={lat}&lon={lon}", timeout=30).json()
40
53
 
41
54
 
42
55
  class Nominatim:
43
- def __init__(self, nominatim_endpoint, geonames_endpoint):
56
+ def __init__(self, nominatim_endpoint: str, geonames_endpoint: str) -> None:
44
57
  self.endpoint = nominatim_endpoint
45
58
  self.geonames = GeoNames(geonames_endpoint)
46
59
 
47
60
  @staticmethod
48
- def get_attribute(raw_json, keys):
61
+ def _get_attribute(raw_json: Dict[str, Any], keys: List[str]) -> Any:
49
62
  for key in keys:
50
63
  if key in raw_json:
51
64
  return raw_json[key]
52
65
 
53
- def geocode(self, address):
54
- return requests.get(f"{self.endpoint}/search?q={address}&format=json").json()
66
+ def _calculate_distance(self, lat_str: Optional[str], lon_str: Optional[str], input_coords: Tuple[float, float]) -> float:
67
+ dist = float("inf")
68
+ if lat_str and lon_str:
69
+ try:
70
+ coords = (float(lat_str), float(lon_str))
71
+ dist = geodesic(input_coords, coords).km
72
+ except (ValueError, TypeError):
73
+ logger.warning("Invalid coordinates for distance calculation.")
74
+ return dist
55
75
 
56
- def geocode_parsed(self, address):
57
- results = self.geocode(address)
76
+ def _parse_nominatim_result(self, nominatim_raw_json: Dict[str, Any]) -> Dict[str, Optional[str]]:
77
+ raw_address = nominatim_raw_json.get("address", {})
58
78
 
59
- if results:
60
- return self.reverse_parsed(results[0]["lat"], results[0]["lon"])
79
+ postcode_str = str(raw_address.get("postcode", ""))
80
+ postcode = self.geonames.validate_postcode(postcode_str)
61
81
 
62
- def reverse(self, lat, lon):
63
- return requests.get(f"{self.endpoint}/reverse?lat={lat}&lon={lon}&format=json").json()
82
+ city = self._get_attribute(raw_address, ["city", "town", "village"])
83
+ district, quarter = self._get_district_quarter(raw_address)
64
84
 
65
- def reverse_parsed(self, lat, lon):
66
- raw_json = self.reverse(lat, lon).get("address", {})
67
- geoname = self.geonames.reverse(lat, lon)
85
+ return {
86
+ "country": raw_address.get("country"),
87
+ "country_code": (raw_address.get("country_code") or "").lower(),
88
+ "state": raw_address.get("state"),
89
+ "province": raw_address.get("province"),
90
+ "city": city,
91
+ "postcode": postcode,
92
+ "district": district,
93
+ "quarter": quarter,
94
+ "street": raw_address.get("road"),
95
+ "number": raw_address.get("house_number"),
96
+ }
68
97
 
69
- postcode = self.geonames.validate_postcode(
70
- str(geoname.get("postal_code", ""))
71
- ) or self.geonames.validate_postcode(str(raw_json.get("postcode")))
98
+ def _parse_geonames_result(self, geonames_raw_json: Dict[str, Any]) -> Dict[str, Optional[str]]:
99
+ geonames_country_code_str = geonames_raw_json.get("country_code")
100
+ country_name = None
101
+ if geonames_country_code_str:
102
+ try:
103
+ country_obj = pycountry.countries.get(alpha_2=geonames_country_code_str.upper())
104
+ if country_obj:
105
+ country_name = spanish.gettext(country_obj.name)
106
+ except LookupError:
107
+ logger.warning(f"Country name not found for code: {geonames_country_code_str} using pycountry.")
72
108
 
73
- city = self.get_attribute(raw_json, ["city", "town", "village"]) or geoname.get("place_name")
109
+ postcode_str = str(geonames_raw_json.get("postal_code", ""))
110
+ postcode = self.geonames.validate_postcode(postcode_str)
111
+ province = self.geonames.get_province_from_postcode(postcode) if postcode else None
112
+ city = geonames_raw_json.get("place_name")
74
113
 
75
- district, quarter = self.get_district_quarter(raw_json)
76
114
  return {
77
- "country": raw_json.get("country"),
78
- "country_code": (raw_json.get("country_code") or geoname.get("country_code") or "").lower(),
79
- "state": raw_json.get("state") or geoname.get("community"),
80
- "province": self.geonames.get_province_from_postcode(postcode),
115
+ "country": country_name,
116
+ "country_code": (geonames_country_code_str or "").lower(),
117
+ "state": geonames_raw_json.get("community"),
118
+ "province": province,
81
119
  "city": city,
82
120
  "postcode": postcode,
83
- "district": district,
84
- "quarter": quarter,
85
- "street": raw_json.get("road"),
86
- "number": raw_json.get("house_number"),
121
+ "district": None,
122
+ "quarter": None,
123
+ "street": None,
124
+ "number": None,
125
+ }
126
+
127
+ def _get_empty_address_result(self) -> Dict[str, None]:
128
+ return {
129
+ "country": None,
130
+ "country_code": None,
131
+ "state": None,
132
+ "province": None,
133
+ "city": None,
134
+ "postcode": None,
135
+ "district": None,
136
+ "quarter": None,
137
+ "street": None,
138
+ "number": None,
87
139
  }
88
140
 
89
- def get_district_quarter(self, raw_json):
90
- district = self.get_attribute(raw_json, ["city_district", "suburb", "borough"])
91
- quarter = self.get_attribute(raw_json, ["quarter", "neighbourhood"])
141
+ def _select_postcode_and_derived_province(
142
+ self, parsed_nominatim_result: Dict[str, Optional[str]], parsed_geonames_result: Dict[str, Optional[str]], nominatim_address_province_raw: Optional[str]
143
+ ) -> Tuple[Optional[str], Optional[str]]:
144
+ """
145
+ Determines the postcode and its derived province based on comparisons
146
+ between Nominatim and GeoNames data, and Nominatim's raw address province.
147
+ """
148
+ nominatim_postcode = parsed_nominatim_result.get("postcode")
149
+ geonames_postcode = parsed_geonames_result.get("postcode")
150
+
151
+ province_from_nominatim_postcode = self.geonames.get_province_from_postcode(nominatim_postcode)
152
+ province_from_geonames_postcode = self.geonames.get_province_from_postcode(geonames_postcode)
153
+
154
+ norm_raw_nominatim_province = (
155
+ normalize(nominatim_address_province_raw) if nominatim_address_province_raw else ""
156
+ )
157
+ norm_province_from_nominatim_postcode = (
158
+ normalize(province_from_nominatim_postcode) if province_from_nominatim_postcode else ""
159
+ )
160
+ norm_province_from_geonames_postcode = (
161
+ normalize(province_from_geonames_postcode) if province_from_geonames_postcode else ""
162
+ )
163
+
164
+ selected_postcode = None
165
+ selected_province_from_postcode = None
166
+
167
+ # If provinces derived from Nominatim and GeoNames postcodes differ
168
+ nominatim_postcode_province_matches = False
169
+ if norm_province_from_nominatim_postcode and norm_raw_nominatim_province:
170
+ nominatim_postcode_province_matches = (
171
+ jaro_winkler_similarity(norm_province_from_nominatim_postcode, norm_raw_nominatim_province)
172
+ > JARO_WINKLER_THRESHOLD
173
+ )
174
+
175
+ geonames_postcode_province_matches = False
176
+ if norm_province_from_geonames_postcode and norm_raw_nominatim_province:
177
+ geonames_postcode_province_matches = (
178
+ jaro_winkler_similarity(norm_province_from_geonames_postcode, norm_raw_nominatim_province)
179
+ > JARO_WINKLER_THRESHOLD
180
+ )
181
+
182
+ # Prefer GeoNames postcode if its province matches Nominatim's raw address province,
183
+ # and Nominatim's own postcode-derived province does not.
184
+ if nominatim_postcode_province_matches:
185
+ selected_postcode = nominatim_postcode
186
+ selected_province_from_postcode = province_from_nominatim_postcode
187
+ if geonames_postcode_province_matches and not nominatim_postcode_province_matches:
188
+ selected_postcode = geonames_postcode
189
+ selected_province_from_postcode = province_from_geonames_postcode
190
+
191
+ return selected_postcode, selected_province_from_postcode
192
+
193
+ def _select_final_result(
194
+ self,
195
+ parsed_nominatim_result: Dict[str, Optional[str]],
196
+ parsed_geonames_result: Dict[str, Optional[str]],
197
+ dist_nominatim: float,
198
+ dist_geonames: float,
199
+ authoritative_postcode: Optional[str],
200
+ authoritative_province_from_postcode: Optional[str],
201
+ nominatim_address_province_raw: Optional[str],
202
+ ) -> Dict[str, Optional[str]]:
203
+ """
204
+ Selects the final address result based on distances and applies the authoritative postcode/province.
205
+ """
206
+ if dist_nominatim <= dist_geonames and dist_nominatim != float("inf"):
207
+ final_result = parsed_nominatim_result
208
+ final_result["postcode"] = authoritative_postcode
209
+ final_result["province"] = nominatim_address_province_raw
210
+ elif dist_geonames < dist_nominatim and dist_geonames != float("inf"):
211
+ final_result = parsed_geonames_result
212
+ final_result["postcode"] = authoritative_postcode
213
+ final_result["province"] = authoritative_province_from_postcode
214
+ else:
215
+ final_result = self._get_empty_address_result()
216
+ return final_result
217
+
218
+ def _get_district_quarter(self, raw_json: Dict[str, Any]) -> Tuple[Optional[str], Optional[str]]:
219
+ district = self._get_attribute(raw_json, ["city_district", "suburb", "borough"])
220
+ quarter = self._get_attribute(raw_json, ["quarter", "neighbourhood"])
92
221
 
93
222
  if not district and quarter:
94
223
  district = quarter
@@ -96,9 +225,64 @@ class Nominatim:
96
225
 
97
226
  return district, quarter
98
227
 
228
+ def geocode(self, address: str) -> List[Dict[str, Any]]:
229
+ return requests.get(f"{self.endpoint}/search?q={address}&format=json", timeout=30).json()
230
+
231
+ def geocode_parsed(self, address: str) -> Optional[Dict[str, Optional[str]]]:
232
+ results = self.geocode(address)
233
+
234
+ if results:
235
+ return self.reverse_parsed(results[0]["lat"], results[0]["lon"])
236
+
237
+ def reverse(self, lat: Union[float, str], lon: Union[float, str]) -> Dict[str, Any]:
238
+ return requests.get(f"{self.endpoint}/reverse?lat={lat}&lon={lon}&format=json", timeout=30).json()
239
+
240
+ def reverse_parsed(self, lat: Union[float, str], lon: Union[float, str]) -> Dict[str, Optional[str]]:
241
+ nominatim_response = self.reverse(lat, lon)
242
+ geonames_response = self.geonames.reverse(lat, lon)
243
+
244
+ # Initial parsing
245
+ parsed_nominatim_result = self._parse_nominatim_result(nominatim_response)
246
+ parsed_geonames_result = self._parse_geonames_result(geonames_response)
247
+
248
+ # Determine authoritative postcode
249
+ raw_nominatim_province = nominatim_response.get("address", {}).get("province")
250
+ selected_postcode, selected_province_from_postcode = self._select_postcode_and_derived_province(
251
+ parsed_nominatim_result, parsed_geonames_result, raw_nominatim_province
252
+ )
253
+
254
+ # Calculate distances
255
+ nominatim_response_lat = nominatim_response.get("lat")
256
+ nominatim_response_lon = nominatim_response.get("lon")
257
+ geonames_response_lat = geonames_response.get("lat")
258
+ geonames_response_lon = geonames_response.get("lon")
259
+
260
+ input_coords = None
261
+ try:
262
+ input_coords = (float(lat), float(lon))
263
+ except (ValueError, TypeError):
264
+ logger.error(f"Invalid input coordinates for distance calculation: lat={lat}, lon={lon}")
265
+ return self._get_empty_address_result()
266
+
267
+ dist_nominatim = self._calculate_distance(nominatim_response_lat, nominatim_response_lon, input_coords)
268
+ dist_geonames = self._calculate_distance(geonames_response_lat, geonames_response_lon, input_coords)
269
+
270
+ # Select final result
271
+ final_result = self._select_final_result(
272
+ parsed_nominatim_result,
273
+ parsed_geonames_result,
274
+ dist_nominatim,
275
+ dist_geonames,
276
+ selected_postcode,
277
+ selected_province_from_postcode,
278
+ raw_nominatim_province,
279
+ )
280
+
281
+ return final_result
282
+
99
283
 
100
284
  class NominatimInterface(Nominatim):
101
- def __init__(self, config):
285
+ def __init__(self, config: Dict[str, Any]) -> None:
102
286
  if "osm" in config:
103
287
  self.config = config["osm"]
104
288
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: datamarket
3
- Version: 0.9.27
3
+ Version: 0.9.29
4
4
  Summary: Utilities that integrate advanced scraping knowledge into just one library.
5
5
  License: GPL-3.0-or-later
6
6
  Author: DataMarket
@@ -28,7 +28,6 @@ Provides-Extra: duckduckgo-search
28
28
  Provides-Extra: fake-useragent
29
29
  Provides-Extra: geoalchemy2
30
30
  Provides-Extra: geopandas
31
- Provides-Extra: geopy
32
31
  Provides-Extra: google-api-python-client
33
32
  Provides-Extra: google-auth-httplib2
34
33
  Provides-Extra: google-auth-oauthlib
@@ -77,13 +76,14 @@ Requires-Dist: dynaconf (>=3.0.0,<4.0.0)
77
76
  Requires-Dist: fake-useragent (>=2.0.0,<3.0.0) ; extra == "fake-useragent"
78
77
  Requires-Dist: geoalchemy2 (>=0.17.0,<0.18.0) ; extra == "geoalchemy2"
79
78
  Requires-Dist: geopandas (>=1.0.0,<2.0.0) ; extra == "geopandas"
80
- Requires-Dist: geopy (>=2.0.0,<3.0.0) ; extra == "geopy"
79
+ Requires-Dist: geopy (>=2.0.0,<3.0.0)
81
80
  Requires-Dist: google-api-python-client (>=2.0.0,<3.0.0) ; extra == "google-api-python-client"
82
81
  Requires-Dist: google-auth-httplib2 (>=0.2.0,<0.3.0) ; extra == "google-auth-httplib2"
83
82
  Requires-Dist: google-auth-oauthlib (>=1.0.0,<2.0.0) ; extra == "google-auth-oauthlib"
84
83
  Requires-Dist: html2text (>=2024.0.0,<2025.0.0) ; extra == "html2text"
85
84
  Requires-Dist: httpx[http2] (>=0.28.0,<0.29.0) ; extra == "httpx"
86
85
  Requires-Dist: inflection (>=0.5.0,<0.6.0)
86
+ Requires-Dist: jellyfish (>=1.0.0,<2.0.0)
87
87
  Requires-Dist: jinja2 (>=3.0.0,<4.0.0)
88
88
  Requires-Dist: json5 (>=0.10.0,<0.11.0) ; extra == "json5"
89
89
  Requires-Dist: lxml[html-clean] (>=5.0.0,<6.0.0) ; extra == "lxml"
@@ -98,6 +98,7 @@ Requires-Dist: playwright (==1.47.0) ; extra == "playwright"
98
98
  Requires-Dist: pre-commit (>=4.0.0,<5.0.0)
99
99
  Requires-Dist: psycopg2-binary (>=2.0.0,<3.0.0)
100
100
  Requires-Dist: pyarrow (>=19.0.0,<20.0.0) ; extra == "pyarrow"
101
+ Requires-Dist: pycountry (>=24.0.0,<25.0.0)
101
102
  Requires-Dist: pydrive2 (>=1.0.0,<2.0.0) ; extra == "pydrive2" or extra == "drive"
102
103
  Requires-Dist: pymupdf (>=1.0.0,<2.0.0) ; extra == "pymupdf"
103
104
  Requires-Dist: pysocks (>=1.0.0,<2.0.0) ; extra == "pysocks"
@@ -4,7 +4,7 @@ datamarket/interfaces/alchemy.py,sha256=4q_gLKCKPK437VKOpdBKSrCyy42P_yWxIhE7KuvH
4
4
  datamarket/interfaces/aws.py,sha256=7KLUeBxmPN7avEMPsu5HC_KHB1N7W6Anp2X8fo43mlw,2383
5
5
  datamarket/interfaces/drive.py,sha256=shbV5jpQVe_KPE-8Idx6Z9te5Zu1SmVfrvSAyd9ZIgE,2915
6
6
  datamarket/interfaces/ftp.py,sha256=o0KlJxtksbop9OjCiQRzyAa2IeG_ExVXagS6apwrAQo,1881
7
- datamarket/interfaces/nominatim.py,sha256=ysIA2J1GhsZ0TJxD6B8N1_a7dkMEqtZQV6mT4Hayecg,3672
7
+ datamarket/interfaces/nominatim.py,sha256=rUnodcRKyZ_reBtyfFFjXNqP1TN0NMScW7zSGiJQ10I,12380
8
8
  datamarket/interfaces/peerdb.py,sha256=cwYwvO740GyaPo9zLAwJsf3UeJDGDiYzjQVM9Q6s-_g,23652
9
9
  datamarket/interfaces/proxy.py,sha256=updoOStKd8-nQBbxWbnD9eOt6HksnYi-5dQ0rEySf5M,3152
10
10
  datamarket/interfaces/tinybird.py,sha256=AYrcRGNOCoCt7ojilkWa27POROee9sTCwZ61GGHEPeM,2698
@@ -19,7 +19,7 @@ datamarket/utils/soda.py,sha256=eZTXFbI1P3WoMd1MM-YjoVTpdjTcDSWuvBb7ViBMhSQ,941
19
19
  datamarket/utils/strings.py,sha256=rEX9NeBG4C7RECgT0EQebgoFoxgZMy9-7EcBSxgBANU,5654
20
20
  datamarket/utils/typer.py,sha256=FDF3l6gh3UlAFPsHCtesnekvct2rKz0oFn3uKARBQvE,814
21
21
  datamarket/utils/types.py,sha256=vxdQZdwdXrfPR4Es52gBgol-tMRIOD6oK9cBo3rB0JQ,74
22
- datamarket-0.9.27.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
23
- datamarket-0.9.27.dist-info/METADATA,sha256=ZDopWDfk3f0HeTZSVAKSnAmfOPSBUOJNlos9fqGzKJA,6543
24
- datamarket-0.9.27.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
25
- datamarket-0.9.27.dist-info/RECORD,,
22
+ datamarket-0.9.29.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
23
+ datamarket-0.9.29.dist-info/METADATA,sha256=h4DuPT0ToLAN6vSLidYyriB9gtKjjaDPcf2MWH5fm44,6588
24
+ datamarket-0.9.29.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
25
+ datamarket-0.9.29.dist-info/RECORD,,