datamarket 0.9.39__py3-none-any.whl → 0.9.40__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the contents of those package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datamarket might be problematic. Click here for more details.

@@ -17,6 +17,7 @@ from ..utils.strings import normalize
17
17
  # PARAMETERS
18
18
 
19
19
  JARO_WINKLER_THRESHOLD = 0.85
20
+ CLOSE_KM = 2.0
20
21
 
21
22
  ########################################################################################################################
22
23
  # CLASSES
@@ -145,55 +146,86 @@ class Nominatim:
145
146
  parsed_nominatim_result: Dict[str, Optional[str]],
146
147
  parsed_geonames_result: Dict[str, Optional[str]],
147
148
  nominatim_address_province_raw: Optional[str],
148
- ) -> Tuple[Optional[str], Optional[str]]:
149
+ dist_nominatim: float, # distance Nominatim ↔ input (km)
150
+ dist_geonames: float, # distance GeoNames ↔ input (km)
151
+ ) -> Tuple[Optional[str], Optional[str], Optional[str]]:
149
152
  """
150
- Determines the postcode and its derived province based on comparisons
151
- between Nominatim and GeoNames data, and Nominatim's raw address province.
153
+ Decide the authoritative postcode, the province derived from it and the associated state.
154
+
155
+ Strategy:
156
+ 1. Derive province from each postcode.
157
+ 2. Validate each postcode–province pair:
158
+ • Nominatim: compare with raw province string (if present).
159
+ • GeoNames: multi-step validation (raw province, then Nominatim-derived
160
+ province when Nominatim coords are close, then distance fallback).
161
+ 3. Return the postcode/province that passes validation with precedence:
162
+ Nominatim > GeoNames. Returns (None, None, None) if neither passes.
152
163
  """
164
+
165
+ # --- Extract postcodes ---
153
166
  nominatim_postcode = parsed_nominatim_result.get("postcode")
154
167
  geonames_postcode = parsed_geonames_result.get("postcode")
155
168
 
156
- province_from_nominatim_postcode = self.geonames.get_province_from_postcode(nominatim_postcode)
157
- province_from_geonames_postcode = self.geonames.get_province_from_postcode(geonames_postcode)
169
+ # --- Province derived from each postcode ---
170
+ province_from_nominatim_pc = self.geonames.get_province_from_postcode(nominatim_postcode)
171
+ province_from_geonames_pc = self.geonames.get_province_from_postcode(geonames_postcode)
158
172
 
159
- norm_raw_nominatim_province = (
160
- normalize(nominatim_address_province_raw) if nominatim_address_province_raw else ""
161
- )
162
- norm_province_from_nominatim_postcode = (
163
- normalize(province_from_nominatim_postcode) if province_from_nominatim_postcode else ""
164
- )
165
- norm_province_from_geonames_postcode = (
166
- normalize(province_from_geonames_postcode) if province_from_geonames_postcode else ""
167
- )
173
+ # --- Normalised strings for similarity comparisons ---
174
+ norm_raw_province = normalize(nominatim_address_province_raw) if nominatim_address_province_raw else ""
175
+ norm_province_from_nominatim_pc = normalize(province_from_nominatim_pc) if province_from_nominatim_pc else ""
176
+ norm_province_from_geonames_pc = normalize(province_from_geonames_pc) if province_from_geonames_pc else ""
168
177
 
169
- selected_postcode = None
170
- selected_province_from_postcode = None
178
+ # --- Distance heuristics ---
179
+ nominatim_is_close = dist_nominatim < CLOSE_KM
180
+ geonames_is_close = dist_geonames < CLOSE_KM
171
181
 
172
- # If provinces derived from Nominatim and GeoNames postcodes differ
173
- nominatim_postcode_province_matches = False
174
- if norm_province_from_nominatim_postcode and norm_raw_nominatim_province:
175
- nominatim_postcode_province_matches = (
176
- jaro_winkler_similarity(norm_province_from_nominatim_postcode, norm_raw_nominatim_province)
177
- > JARO_WINKLER_THRESHOLD
182
+ # --- Validate Nominatim postcode ---
183
+ nominatim_pc_valid = False
184
+ if norm_province_from_nominatim_pc and norm_raw_province:
185
+ nominatim_pc_valid = (
186
+ jaro_winkler_similarity(norm_province_from_nominatim_pc, norm_raw_province) > JARO_WINKLER_THRESHOLD
178
187
  )
179
188
 
180
- geonames_postcode_province_matches = False
181
- if norm_province_from_geonames_postcode and norm_raw_nominatim_province:
182
- geonames_postcode_province_matches = (
183
- jaro_winkler_similarity(norm_province_from_geonames_postcode, norm_raw_nominatim_province)
184
- > JARO_WINKLER_THRESHOLD
185
- )
189
+ # --- Validate GeoNames postcode ---
190
+ geonames_pc_valid = False
186
191
 
187
- # Prefer GeoNames postcode if its province matches Nominatim's raw address province,
188
- # and Nominatim's own postcode-derived province does not.
189
- if nominatim_postcode_province_matches:
190
- selected_postcode = nominatim_postcode
191
- selected_province_from_postcode = province_from_nominatim_postcode
192
- if geonames_postcode_province_matches and not nominatim_postcode_province_matches:
193
- selected_postcode = geonames_postcode
194
- selected_province_from_postcode = province_from_geonames_postcode
192
+ # 1) Compare with raw province string (if exists)
193
+ if norm_province_from_geonames_pc and norm_raw_province:
194
+ geonames_pc_valid = (
195
+ jaro_winkler_similarity(norm_province_from_geonames_pc, norm_raw_province) > JARO_WINKLER_THRESHOLD
196
+ )
195
197
 
196
- return selected_postcode, selected_province_from_postcode
198
+ # 2) If no raw province, compare with province from Nominatim PC **only when** Nominatim is close
199
+ if not geonames_pc_valid and not norm_raw_province and nominatim_is_close:
200
+ if norm_province_from_geonames_pc and norm_province_from_nominatim_pc:
201
+ geonames_pc_valid = (
202
+ jaro_winkler_similarity(norm_province_from_geonames_pc, norm_province_from_nominatim_pc)
203
+ > JARO_WINKLER_THRESHOLD
204
+ )
205
+
206
+ # 3) Fallback: accept GeoNames PC if its coordinates are very close
207
+ if not geonames_pc_valid and geonames_is_close and geonames_postcode:
208
+ geonames_pc_valid = True
209
+
210
+ # --- Select authoritative tuple ---
211
+ postcode = None
212
+ province = None
213
+ state = None
214
+
215
+ if nominatim_pc_valid:
216
+ postcode = nominatim_postcode
217
+ province = province_from_nominatim_pc
218
+ state = parsed_nominatim_result.get("state")
219
+ if not state and geonames_pc_valid:
220
+ state = parsed_geonames_result.get("state")
221
+ elif geonames_pc_valid:
222
+ postcode = geonames_postcode
223
+ province = province_from_geonames_pc
224
+ state = parsed_geonames_result.get("state")
225
+ if not state and nominatim_pc_valid:
226
+ state = parsed_nominatim_result.get("state")
227
+
228
+ return postcode, province, state
197
229
 
198
230
  def _select_final_result(
199
231
  self,
@@ -203,21 +235,41 @@ class Nominatim:
203
235
  dist_geonames: float,
204
236
  authoritative_postcode: Optional[str],
205
237
  authoritative_province_from_postcode: Optional[str],
206
- nominatim_province: Optional[str],
238
+ authoritative_state: Optional[str],
207
239
  ) -> Dict[str, Optional[str]]:
208
240
  """
209
- Selects the final address result based on distances and applies the authoritative postcode/province.
241
+ Choose the address block (Nominatim vs GeoNames) based on distance,
242
+ then apply the authoritative postcode/province.
243
+
244
+ Rules:
245
+ • Pick the source with the smaller finite distance.
246
+ • Always overwrite 'postcode' if authoritative_postcode is present.
247
+ • Overwrite 'province' only when authoritative_province_from_postcode is not None.
248
+ • If both distances are ∞, return an empty address.
210
249
  """
250
+
251
+ # ------------------------------------------------------------------ #
252
+ # 1. Decide the base address block #
253
+ # ------------------------------------------------------------------ #
211
254
  if dist_nominatim <= dist_geonames and dist_nominatim != float("inf"):
212
255
  final_result = parsed_nominatim_result
213
- final_result["postcode"] = authoritative_postcode
214
- final_result["province"] = nominatim_province
215
256
  elif dist_geonames < dist_nominatim and dist_geonames != float("inf"):
216
257
  final_result = parsed_geonames_result
258
+ else:
259
+ return self._get_empty_address_result()
260
+
261
+ # ------------------------------------------------------------------ #
262
+ # 2. Apply authoritative postcode / province #
263
+ # ------------------------------------------------------------------ #
264
+ if authoritative_postcode:
217
265
  final_result["postcode"] = authoritative_postcode
266
+
267
+ if authoritative_province_from_postcode:
218
268
  final_result["province"] = authoritative_province_from_postcode
219
- else:
220
- final_result = self._get_empty_address_result()
269
+
270
+ if authoritative_province_from_postcode:
271
+ final_result["state"] = authoritative_state
272
+
221
273
  return final_result
222
274
 
223
275
  def _get_district_quarter(self, raw_json: Dict[str, Any]) -> Tuple[Optional[str], Optional[str]]:
@@ -250,12 +302,6 @@ class Nominatim:
250
302
  parsed_nominatim_result = self._parse_nominatim_result(nominatim_response)
251
303
  parsed_geonames_result = self._parse_geonames_result(geonames_response)
252
304
 
253
- # Determine authoritative postcode
254
- nominatim_province = parsed_nominatim_result.get("province")
255
- selected_postcode, selected_province_from_postcode = self._select_postcode_and_derived_province(
256
- parsed_nominatim_result, parsed_geonames_result, nominatim_province
257
- )
258
-
259
305
  # Calculate distances
260
306
  nominatim_response_lat = nominatim_response.get("lat")
261
307
  nominatim_response_lon = nominatim_response.get("lon")
@@ -272,6 +318,12 @@ class Nominatim:
272
318
  dist_nominatim = self._calculate_distance(nominatim_response_lat, nominatim_response_lon, input_coords)
273
319
  dist_geonames = self._calculate_distance(geonames_response_lat, geonames_response_lon, input_coords)
274
320
 
321
+ # Determine authoritative postcode
322
+ nominatim_province = parsed_nominatim_result.get("province")
323
+ selected_postcode, selected_province_from_postcode, selected_state = self._select_postcode_and_derived_province(
324
+ parsed_nominatim_result, parsed_geonames_result, nominatim_province, dist_nominatim, dist_geonames
325
+ )
326
+
275
327
  # Select final result
276
328
  final_result = self._select_final_result(
277
329
  parsed_nominatim_result,
@@ -280,7 +332,7 @@ class Nominatim:
280
332
  dist_geonames,
281
333
  selected_postcode,
282
334
  selected_province_from_postcode,
283
- nominatim_province,
335
+ selected_state,
284
336
  )
285
337
 
286
338
  return final_result
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: datamarket
3
- Version: 0.9.39
3
+ Version: 0.9.40
4
4
  Summary: Utilities that integrate advanced scraping knowledge into just one library.
5
5
  License: GPL-3.0-or-later
6
6
  Author: DataMarket
@@ -4,7 +4,7 @@ datamarket/interfaces/alchemy.py,sha256=mQwjDqBpz1QHRV2JTCALvn5iK_ky69oE2Gw-EtRX
4
4
  datamarket/interfaces/aws.py,sha256=7KLUeBxmPN7avEMPsu5HC_KHB1N7W6Anp2X8fo43mlw,2383
5
5
  datamarket/interfaces/drive.py,sha256=shbV5jpQVe_KPE-8Idx6Z9te5Zu1SmVfrvSAyd9ZIgE,2915
6
6
  datamarket/interfaces/ftp.py,sha256=o0KlJxtksbop9OjCiQRzyAa2IeG_ExVXagS6apwrAQo,1881
7
- datamarket/interfaces/nominatim.py,sha256=XzqBFVBzGU2BIFnFueZ56tk4JhaQgj5dvFalnCG6Zxk,12417
7
+ datamarket/interfaces/nominatim.py,sha256=d_KIrgzTusVYnw0Fk3YWCjrzlT9sI_bObGG-wOr__as,14726
8
8
  datamarket/interfaces/peerdb.py,sha256=cwYwvO740GyaPo9zLAwJsf3UeJDGDiYzjQVM9Q6s-_g,23652
9
9
  datamarket/interfaces/proxy.py,sha256=updoOStKd8-nQBbxWbnD9eOt6HksnYi-5dQ0rEySf5M,3152
10
10
  datamarket/interfaces/tinybird.py,sha256=AYrcRGNOCoCt7ojilkWa27POROee9sTCwZ61GGHEPeM,2698
@@ -24,7 +24,7 @@ datamarket/utils/strings/normalization.py,sha256=QLZ-THzjGOK9eWPPR1PrsffwQkSOx_M
24
24
  datamarket/utils/strings/obfuscation.py,sha256=8gMepfjPq0N4_IpKR6i2dy_9VJugQ3qJiRiRvKavB3s,5246
25
25
  datamarket/utils/typer.py,sha256=FDF3l6gh3UlAFPsHCtesnekvct2rKz0oFn3uKARBQvE,814
26
26
  datamarket/utils/types.py,sha256=vxdQZdwdXrfPR4Es52gBgol-tMRIOD6oK9cBo3rB0JQ,74
27
- datamarket-0.9.39.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
28
- datamarket-0.9.39.dist-info/METADATA,sha256=hSvX9DNsR_mJwh_Wcx1HbU5o3LTK2nGctn-kzQ-ZERo,6961
29
- datamarket-0.9.39.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
30
- datamarket-0.9.39.dist-info/RECORD,,
27
+ datamarket-0.9.40.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
28
+ datamarket-0.9.40.dist-info/METADATA,sha256=odG5B_7jOuXZFBNe260K0HP7vkDntMf8yAUOgSxNOws,6961
29
+ datamarket-0.9.40.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
30
+ datamarket-0.9.40.dist-info/RECORD,,