datamarket 0.9.39__py3-none-any.whl → 0.9.41__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datamarket might be problematic. Click here for more details.

@@ -0,0 +1 @@
1
+ from .main import * # noqa: F403
@@ -0,0 +1,14 @@
1
+ ########################################################################################################################
2
+ # CLASSES
3
+
4
+
5
class RedirectionDetectedError(Exception):
    """Signal that a redirection was detected.

    The text is available both as ``str(exc)`` and as the ``message``
    attribute.
    """

    def __init__(self, message="Redirection detected!"):
        # Hand the text to Exception first so ``args``/``str()`` carry it,
        # then expose it under an explicit attribute name as well.
        super().__init__(message)
        self.message = message
9
+
10
+
11
class NotFoundError(Exception):
    """Signal that a requested resource was not found.

    The text is available both as ``str(exc)`` and as the ``message``
    attribute.
    """

    def __init__(self, message="Not found!"):
        # Hand the text to Exception first so ``args``/``str()`` carry it,
        # then expose it under an explicit attribute name as well.
        super().__init__(message)
        self.message = message
@@ -17,6 +17,7 @@ from ..utils.strings import normalize
17
17
  # PARAMETERS
18
18
 
19
19
  JARO_WINKLER_THRESHOLD = 0.85
20
+ CLOSE_KM = 2.0
20
21
 
21
22
  ########################################################################################################################
22
23
  # CLASSES
@@ -145,55 +146,86 @@ class Nominatim:
145
146
  parsed_nominatim_result: Dict[str, Optional[str]],
146
147
  parsed_geonames_result: Dict[str, Optional[str]],
147
148
  nominatim_address_province_raw: Optional[str],
148
- ) -> Tuple[Optional[str], Optional[str]]:
149
+ dist_nominatim: float, # distance Nominatim ↔ input (km)
150
+ dist_geonames: float, # distance GeoNames ↔ input (km)
151
+ ) -> Tuple[Optional[str], Optional[str], Optional[str]]:
149
152
  """
150
- Determines the postcode and its derived province based on comparisons
151
- between Nominatim and GeoNames data, and Nominatim's raw address province.
153
+ Decide the authoritative postcode, the province derived from it and the associated state.
154
+
155
+ Strategy:
156
+ 1. Derive province from each postcode.
157
+ 2. Validate each postcode–province pair:
158
+ • Nominatim: compare with raw province string (if present).
159
+ • GeoNames: multi-step validation (raw province, then Nominatim-derived
160
+ province when Nominatim coords are close, then distance fallback).
161
+ 3. Return the postcode/province that passes validation with precedence:
162
+ Nominatim > GeoNames. Returns (None, None, None) if neither passes.
152
163
  """
164
+
165
+ # --- Extract postcodes ---
153
166
  nominatim_postcode = parsed_nominatim_result.get("postcode")
154
167
  geonames_postcode = parsed_geonames_result.get("postcode")
155
168
 
156
- province_from_nominatim_postcode = self.geonames.get_province_from_postcode(nominatim_postcode)
157
- province_from_geonames_postcode = self.geonames.get_province_from_postcode(geonames_postcode)
169
+ # --- Province derived from each postcode ---
170
+ province_from_nominatim_pc = self.geonames.get_province_from_postcode(nominatim_postcode)
171
+ province_from_geonames_pc = self.geonames.get_province_from_postcode(geonames_postcode)
158
172
 
159
- norm_raw_nominatim_province = (
160
- normalize(nominatim_address_province_raw) if nominatim_address_province_raw else ""
161
- )
162
- norm_province_from_nominatim_postcode = (
163
- normalize(province_from_nominatim_postcode) if province_from_nominatim_postcode else ""
164
- )
165
- norm_province_from_geonames_postcode = (
166
- normalize(province_from_geonames_postcode) if province_from_geonames_postcode else ""
167
- )
173
+ # --- Normalised strings for similarity comparisons ---
174
+ norm_raw_province = normalize(nominatim_address_province_raw) if nominatim_address_province_raw else ""
175
+ norm_province_from_nominatim_pc = normalize(province_from_nominatim_pc) if province_from_nominatim_pc else ""
176
+ norm_province_from_geonames_pc = normalize(province_from_geonames_pc) if province_from_geonames_pc else ""
168
177
 
169
- selected_postcode = None
170
- selected_province_from_postcode = None
178
+ # --- Distance heuristics ---
179
+ nominatim_is_close = dist_nominatim < CLOSE_KM
180
+ geonames_is_close = dist_geonames < CLOSE_KM
171
181
 
172
- # If provinces derived from Nominatim and GeoNames postcodes differ
173
- nominatim_postcode_province_matches = False
174
- if norm_province_from_nominatim_postcode and norm_raw_nominatim_province:
175
- nominatim_postcode_province_matches = (
176
- jaro_winkler_similarity(norm_province_from_nominatim_postcode, norm_raw_nominatim_province)
177
- > JARO_WINKLER_THRESHOLD
182
+ # --- Validate Nominatim postcode ---
183
+ nominatim_pc_valid = False
184
+ if norm_province_from_nominatim_pc and norm_raw_province:
185
+ nominatim_pc_valid = (
186
+ jaro_winkler_similarity(norm_province_from_nominatim_pc, norm_raw_province) > JARO_WINKLER_THRESHOLD
178
187
  )
179
188
 
180
- geonames_postcode_province_matches = False
181
- if norm_province_from_geonames_postcode and norm_raw_nominatim_province:
182
- geonames_postcode_province_matches = (
183
- jaro_winkler_similarity(norm_province_from_geonames_postcode, norm_raw_nominatim_province)
184
- > JARO_WINKLER_THRESHOLD
185
- )
189
+ # --- Validate GeoNames postcode ---
190
+ geonames_pc_valid = False
186
191
 
187
- # Prefer GeoNames postcode if its province matches Nominatim's raw address province,
188
- # and Nominatim's own postcode-derived province does not.
189
- if nominatim_postcode_province_matches:
190
- selected_postcode = nominatim_postcode
191
- selected_province_from_postcode = province_from_nominatim_postcode
192
- if geonames_postcode_province_matches and not nominatim_postcode_province_matches:
193
- selected_postcode = geonames_postcode
194
- selected_province_from_postcode = province_from_geonames_postcode
192
+ # 1) Compare with raw province string (if exists)
193
+ if norm_province_from_geonames_pc and norm_raw_province:
194
+ geonames_pc_valid = (
195
+ jaro_winkler_similarity(norm_province_from_geonames_pc, norm_raw_province) > JARO_WINKLER_THRESHOLD
196
+ )
195
197
 
196
- return selected_postcode, selected_province_from_postcode
198
+ # 2) If no raw province, compare with province from Nominatim PC **only when** Nominatim is close
199
+ if not geonames_pc_valid and not norm_raw_province and nominatim_is_close:
200
+ if norm_province_from_geonames_pc and norm_province_from_nominatim_pc:
201
+ geonames_pc_valid = (
202
+ jaro_winkler_similarity(norm_province_from_geonames_pc, norm_province_from_nominatim_pc)
203
+ > JARO_WINKLER_THRESHOLD
204
+ )
205
+
206
+ # 3) Fallback: accept GeoNames PC if its coordinates are very close
207
+ if not geonames_pc_valid and geonames_is_close and geonames_postcode:
208
+ geonames_pc_valid = True
209
+
210
+ # --- Select authoritative tuple ---
211
+ postcode = None
212
+ province = None
213
+ state = None
214
+
215
+ if nominatim_pc_valid:
216
+ postcode = nominatim_postcode
217
+ province = province_from_nominatim_pc
218
+ state = parsed_nominatim_result.get("state")
219
+ if not state and geonames_pc_valid:
220
+ state = parsed_geonames_result.get("state")
221
+ elif geonames_pc_valid:
222
+ postcode = geonames_postcode
223
+ province = province_from_geonames_pc
224
+ state = parsed_geonames_result.get("state")
225
+ if not state and nominatim_pc_valid:
226
+ state = parsed_nominatim_result.get("state")
227
+
228
+ return postcode, province, state
197
229
 
198
230
  def _select_final_result(
199
231
  self,
@@ -203,21 +235,41 @@ class Nominatim:
203
235
  dist_geonames: float,
204
236
  authoritative_postcode: Optional[str],
205
237
  authoritative_province_from_postcode: Optional[str],
206
- nominatim_province: Optional[str],
238
+ authoritative_state: Optional[str],
207
239
  ) -> Dict[str, Optional[str]]:
208
240
  """
209
- Selects the final address result based on distances and applies the authoritative postcode/province.
241
+ Choose the address block (Nominatim vs GeoNames) based on distance,
242
+ then apply the authoritative postcode/province.
243
+
244
+ Rules:
245
+ • Pick the source with the smaller finite distance.
246
+ • Always overwrite 'postcode' if authoritative_postcode is present.
247
+ • Overwrite 'province' only when authoritative_province_from_postcode is not None.
248
+ • If both distances are ∞, return an empty address.
210
249
  """
250
+
251
+ # ------------------------------------------------------------------ #
252
+ # 1. Decide the base address block #
253
+ # ------------------------------------------------------------------ #
211
254
  if dist_nominatim <= dist_geonames and dist_nominatim != float("inf"):
212
255
  final_result = parsed_nominatim_result
213
- final_result["postcode"] = authoritative_postcode
214
- final_result["province"] = nominatim_province
215
256
  elif dist_geonames < dist_nominatim and dist_geonames != float("inf"):
216
257
  final_result = parsed_geonames_result
258
+ else:
259
+ return self._get_empty_address_result()
260
+
261
+ # ------------------------------------------------------------------ #
262
+ # 2. Apply authoritative postcode / province #
263
+ # ------------------------------------------------------------------ #
264
+ if authoritative_postcode:
217
265
  final_result["postcode"] = authoritative_postcode
266
+
267
+ if authoritative_province_from_postcode:
218
268
  final_result["province"] = authoritative_province_from_postcode
219
- else:
220
- final_result = self._get_empty_address_result()
269
+
270
+ if authoritative_province_from_postcode:
271
+ final_result["state"] = authoritative_state
272
+
221
273
  return final_result
222
274
 
223
275
  def _get_district_quarter(self, raw_json: Dict[str, Any]) -> Tuple[Optional[str], Optional[str]]:
@@ -250,12 +302,6 @@ class Nominatim:
250
302
  parsed_nominatim_result = self._parse_nominatim_result(nominatim_response)
251
303
  parsed_geonames_result = self._parse_geonames_result(geonames_response)
252
304
 
253
- # Determine authoritative postcode
254
- nominatim_province = parsed_nominatim_result.get("province")
255
- selected_postcode, selected_province_from_postcode = self._select_postcode_and_derived_province(
256
- parsed_nominatim_result, parsed_geonames_result, nominatim_province
257
- )
258
-
259
305
  # Calculate distances
260
306
  nominatim_response_lat = nominatim_response.get("lat")
261
307
  nominatim_response_lon = nominatim_response.get("lon")
@@ -272,6 +318,12 @@ class Nominatim:
272
318
  dist_nominatim = self._calculate_distance(nominatim_response_lat, nominatim_response_lon, input_coords)
273
319
  dist_geonames = self._calculate_distance(geonames_response_lat, geonames_response_lon, input_coords)
274
320
 
321
+ # Determine authoritative postcode
322
+ nominatim_province = parsed_nominatim_result.get("province")
323
+ selected_postcode, selected_province_from_postcode, selected_state = self._select_postcode_and_derived_province(
324
+ parsed_nominatim_result, parsed_geonames_result, nominatim_province, dist_nominatim, dist_geonames
325
+ )
326
+
275
327
  # Select final result
276
328
  final_result = self._select_final_result(
277
329
  parsed_nominatim_result,
@@ -280,7 +332,7 @@ class Nominatim:
280
332
  dist_geonames,
281
333
  selected_postcode,
282
334
  selected_province_from_postcode,
283
- nominatim_province,
335
+ selected_state,
284
336
  )
285
337
 
286
338
  return final_result
@@ -1 +1 @@
1
- from .main import *
1
+ from .main import * # noqa: F403
datamarket/utils/main.py CHANGED
@@ -13,8 +13,26 @@ from pathlib import Path
13
13
  from typing import Any, Literal, Self, Union
14
14
 
15
15
  import pendulum
16
+ import requests
17
+ from bs4 import BeautifulSoup
16
18
  from croniter import croniter
17
19
  from dynaconf import Dynaconf, add_converter
20
+ from requests.exceptions import ProxyError
21
+ from tenacity import (
22
+ before_sleep_log,
23
+ retry,
24
+ retry_if_exception_type,
25
+ retry_if_not_exception_type,
26
+ stop_after_attempt,
27
+ stop_after_delay,
28
+ wait_exponential,
29
+ )
30
+
31
+ from ..exceptions import NotFoundError, RedirectionDetectedError
32
+ from ..interfaces.proxy import ProxyInterface
33
+
34
+ ########################################################################################################################
35
+ # FUNCTIONS
18
36
 
19
37
  logger = logging.getLogger(__name__)
20
38
 
@@ -26,9 +44,7 @@ class NoProjectFoundError(Exception):
26
44
 
27
45
  class NoPackageFoundError(Exception):
28
46
  def __init__(self):
29
- super().__init__(
30
- "A project was detected but it has no packages inside the 'src' directory"
31
- )
47
+ super().__init__("A project was detected but it has no packages inside the 'src' directory")
32
48
 
33
49
 
34
50
  ########################################################################################################################
@@ -84,8 +100,8 @@ class Project:
84
100
 
85
101
  try:
86
102
  self.pkg_name = next((self.path / "src").glob("*")).name
87
- except StopIteration:
88
- raise NoPackageFoundError()
103
+ except StopIteration as e:
104
+ raise NoPackageFoundError() from e
89
105
 
90
106
  self.env_name = f"{self.pkg_name}_env"
91
107
  self.config_path = self.path / self.CONFIG_FILE_NAME
@@ -227,3 +243,61 @@ def parse_field(dict_struct, field_path, format_method=None):
227
243
  if field_value is None:
228
244
  return None
229
245
  return format_method(field_value) if format_method else field_value
246
+
247
+
248
+ @retry(
249
+ retry=retry_if_not_exception_type((NotFoundError, RedirectionDetectedError, ProxyError)),
250
+ wait=wait_exponential(exp_base=3, multiplier=3, max=60),
251
+ stop=stop_after_attempt(5),
252
+ before_sleep=before_sleep_log(logger, logging.WARNING),
253
+ reraise=True,
254
+ )
255
+ def get_data(
256
+ url: str,
257
+ method: str = "GET",
258
+ output: str = "json",
259
+ sleep: tuple = (6, 3),
260
+ proxy_interface: ProxyInterface = None,
261
+ use_auth_proxies: bool = False,
262
+ max_proxy_delay: int = 1800,
263
+ **kwargs,
264
+ ):
265
+ retry_type = retry_if_exception_type(ProxyError)
266
+ wait = wait_exponential(exp_base=3, multiplier=3, max=60)
267
+ stop = stop_after_delay(max_proxy_delay)
268
+ before_sleep = before_sleep_log(logger, logging.WARNING)
269
+
270
+ @retry(retry=retry_type, wait=wait, stop=stop, before_sleep=before_sleep, reraise=True)
271
+ def _fetch_with_proxy_retry(url, method, proxy_interface, use_auth, **params):
272
+ logger.info(f"Fetching data from {url} ...")
273
+ proxy_cfg = None
274
+ if proxy_interface:
275
+ host, port, user, pwd = proxy_interface.get_proxies(raw=True, use_auth=use_auth)
276
+ if host and port:
277
+ proxy_url = f"http://{host}:{port}"
278
+ proxy_auth_url = f"http://{user}:{pwd}@{host}:{port}"
279
+ proxy_cfg = {"http": proxy_url, "https": proxy_url}
280
+ if user and pwd:
281
+ proxy_cfg = {"http": proxy_auth_url, "https": proxy_auth_url}
282
+ logger.info(f"Using proxy: {proxy_url}")
283
+ response = getattr(requests, method.lower())(url, proxies=proxy_cfg, **params)
284
+ return response
285
+
286
+ params = {"timeout": 30} | kwargs
287
+ r = _fetch_with_proxy_retry(url, method, proxy_interface, use_auth_proxies, **params)
288
+
289
+ ban_sleep(*sleep)
290
+
291
+ if r.status_code == 404:
292
+ raise NotFoundError(f"404 Not Found error for {url}")
293
+ r.raise_for_status()
294
+ r.encoding = "utf-8"
295
+
296
+ if output == "json":
297
+ return r.json()
298
+ elif output == "text":
299
+ return r.text
300
+ elif output == "soup":
301
+ return BeautifulSoup(r.content, "html.parser")
302
+ elif output == "response":
303
+ return r
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: datamarket
3
- Version: 0.9.39
3
+ Version: 0.9.41
4
4
  Summary: Utilities that integrate advanced scraping knowledge into just one library.
5
5
  License: GPL-3.0-or-later
6
6
  Author: DataMarket
@@ -48,6 +48,7 @@ Provides-Extra: proxy
48
48
  Provides-Extra: pyarrow
49
49
  Provides-Extra: pydrive2
50
50
  Provides-Extra: pymupdf
51
+ Provides-Extra: pyrate-limiter
51
52
  Provides-Extra: pysocks
52
53
  Provides-Extra: pyspark
53
54
  Provides-Extra: pytest
@@ -105,6 +106,7 @@ Requires-Dist: pyarrow (>=19.0.0,<20.0.0) ; extra == "pyarrow"
105
106
  Requires-Dist: pycountry (>=24.0.0,<25.0.0)
106
107
  Requires-Dist: pydrive2 (>=1.0.0,<2.0.0) ; extra == "pydrive2" or extra == "drive"
107
108
  Requires-Dist: pymupdf (>=1.0.0,<2.0.0) ; extra == "pymupdf"
109
+ Requires-Dist: pyrate-limiter (>=3.0.0,<4.0.0) ; extra == "pyrate-limiter"
108
110
  Requires-Dist: pysocks (>=1.0.0,<2.0.0) ; extra == "pysocks"
109
111
  Requires-Dist: pyspark (>=3.0.0,<4.0.0) ; extra == "pyspark"
110
112
  Requires-Dist: pytest (>=8.0.0,<9.0.0) ; extra == "pytest"
@@ -1,19 +1,21 @@
1
1
  datamarket/__init__.py,sha256=FHS77P9qNewKMoN-p0FLEUEC60oWIYup1QkbJZP4ays,12
2
+ datamarket/exceptions/__init__.py,sha256=-Vu-RZNKjW6fYCLqbUJTkKNuHeA8Yi_gyR50oZNaA_8,33
3
+ datamarket/exceptions/main.py,sha256=MP5ql6M7DoMbBf-Dg_2ohcUFdWXgzv-dXHntPPit31s,453
2
4
  datamarket/interfaces/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
5
  datamarket/interfaces/alchemy.py,sha256=mQwjDqBpz1QHRV2JTCALvn5iK_ky69oE2Gw-EtRXsqQ,14664
4
6
  datamarket/interfaces/aws.py,sha256=7KLUeBxmPN7avEMPsu5HC_KHB1N7W6Anp2X8fo43mlw,2383
5
7
  datamarket/interfaces/drive.py,sha256=shbV5jpQVe_KPE-8Idx6Z9te5Zu1SmVfrvSAyd9ZIgE,2915
6
8
  datamarket/interfaces/ftp.py,sha256=o0KlJxtksbop9OjCiQRzyAa2IeG_ExVXagS6apwrAQo,1881
7
- datamarket/interfaces/nominatim.py,sha256=XzqBFVBzGU2BIFnFueZ56tk4JhaQgj5dvFalnCG6Zxk,12417
9
+ datamarket/interfaces/nominatim.py,sha256=d_KIrgzTusVYnw0Fk3YWCjrzlT9sI_bObGG-wOr__as,14726
8
10
  datamarket/interfaces/peerdb.py,sha256=cwYwvO740GyaPo9zLAwJsf3UeJDGDiYzjQVM9Q6s-_g,23652
9
11
  datamarket/interfaces/proxy.py,sha256=updoOStKd8-nQBbxWbnD9eOt6HksnYi-5dQ0rEySf5M,3152
10
12
  datamarket/interfaces/tinybird.py,sha256=AYrcRGNOCoCt7ojilkWa27POROee9sTCwZ61GGHEPeM,2698
11
13
  datamarket/params/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
14
  datamarket/params/nominatim.py,sha256=XrCru3yEbs-X3ueOaCeSTBZwi4CWHW7oNhEyexBleMw,1184
13
- datamarket/utils/__init__.py,sha256=8D5a8oKgqd6WA1RUkiKCn4l_PVemtyuckxQut0vDHXM,20
15
+ datamarket/utils/__init__.py,sha256=FHLh-Qp9XpM4LkAocppCf_llW2CWVVghGorkqxqt1wk,34
14
16
  datamarket/utils/airflow.py,sha256=al0vc0YUikNu3Oy51VSn52I7pMU40akFBOl_UlHa2E4,795
15
17
  datamarket/utils/alchemy.py,sha256=SRq6kgh1aANXVShBPgAuglmNhZssPWwWEY503gKSia8,635
16
- datamarket/utils/main.py,sha256=j8wnAxeLvijdRU9M4V6HunWH7vgWWHP4u4xamzkWcUU,7009
18
+ datamarket/utils/main.py,sha256=DMMgkQnMS6fNziTru8FM9z2ERfYfkdR9qFPF7s6sp7U,9657
17
19
  datamarket/utils/playwright/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
20
  datamarket/utils/playwright/async_api.py,sha256=pWfVj-ItfIeZBxG7WiUHKSeZDcHQFUQ5mrNGyIh1IdA,883
19
21
  datamarket/utils/playwright/sync_api.py,sha256=lIGm8mLxhFg04LVNdF8SO_9yjOLBnWe5pPry6ZFMnIg,846
@@ -24,7 +26,7 @@ datamarket/utils/strings/normalization.py,sha256=QLZ-THzjGOK9eWPPR1PrsffwQkSOx_M
24
26
  datamarket/utils/strings/obfuscation.py,sha256=8gMepfjPq0N4_IpKR6i2dy_9VJugQ3qJiRiRvKavB3s,5246
25
27
  datamarket/utils/typer.py,sha256=FDF3l6gh3UlAFPsHCtesnekvct2rKz0oFn3uKARBQvE,814
26
28
  datamarket/utils/types.py,sha256=vxdQZdwdXrfPR4Es52gBgol-tMRIOD6oK9cBo3rB0JQ,74
27
- datamarket-0.9.39.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
28
- datamarket-0.9.39.dist-info/METADATA,sha256=hSvX9DNsR_mJwh_Wcx1HbU5o3LTK2nGctn-kzQ-ZERo,6961
29
- datamarket-0.9.39.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
30
- datamarket-0.9.39.dist-info/RECORD,,
29
+ datamarket-0.9.41.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
30
+ datamarket-0.9.41.dist-info/METADATA,sha256=uTscsuH3qLNTIPoSBhg07DsWqGhUKcjPMtOknw8vn2A,7067
31
+ datamarket-0.9.41.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
32
+ datamarket-0.9.41.dist-info/RECORD,,