datamarket-0.9.40-py3-none-any.whl → datamarket-0.9.42-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datamarket might be problematic. Click here for more details.

datamarket/exceptions/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .main import * # noqa: F403
datamarket/exceptions/main.py ADDED
@@ -0,0 +1,14 @@
1
+ ########################################################################################################################
2
+ # CLASSES
3
+
4
+
5
+ class RedirectionDetectedError(Exception):
6
+ def __init__(self, message="Redirection detected!"):
7
+ self.message = message
8
+ super().__init__(self.message)
9
+
10
+
11
+ class NotFoundError(Exception):
12
+ def __init__(self, message="Not found!"):
13
+ self.message = message
14
+ super().__init__(self.message)
datamarket/utils/__init__.py CHANGED
@@ -1 +1 @@
1
- from .main import *
1
+ from .main import * # noqa: F403
datamarket/utils/main.py CHANGED
@@ -13,8 +13,26 @@ from pathlib import Path
13
13
  from typing import Any, Literal, Self, Union
14
14
 
15
15
  import pendulum
16
+ import requests
17
+ from bs4 import BeautifulSoup
16
18
  from croniter import croniter
17
19
  from dynaconf import Dynaconf, add_converter
20
+ from requests.exceptions import ProxyError
21
+ from tenacity import (
22
+ before_sleep_log,
23
+ retry,
24
+ retry_if_exception_type,
25
+ retry_if_not_exception_type,
26
+ stop_after_attempt,
27
+ stop_after_delay,
28
+ wait_exponential,
29
+ )
30
+
31
+ from ..exceptions import NotFoundError, RedirectionDetectedError
32
+ from ..interfaces.proxy import ProxyInterface
33
+
34
+ ########################################################################################################################
35
+ # FUNCTIONS
18
36
 
19
37
  logger = logging.getLogger(__name__)
20
38
 
@@ -26,9 +44,7 @@ class NoProjectFoundError(Exception):
26
44
 
27
45
  class NoPackageFoundError(Exception):
28
46
  def __init__(self):
29
- super().__init__(
30
- "A project was detected but it has no packages inside the 'src' directory"
31
- )
47
+ super().__init__("A project was detected but it has no packages inside the 'src' directory")
32
48
 
33
49
 
34
50
  ########################################################################################################################
@@ -84,8 +100,8 @@ class Project:
84
100
 
85
101
  try:
86
102
  self.pkg_name = next((self.path / "src").glob("*")).name
87
- except StopIteration:
88
- raise NoPackageFoundError()
103
+ except StopIteration as e:
104
+ raise NoPackageFoundError() from e
89
105
 
90
106
  self.env_name = f"{self.pkg_name}_env"
91
107
  self.config_path = self.path / self.CONFIG_FILE_NAME
@@ -227,3 +243,61 @@ def parse_field(dict_struct, field_path, format_method=None):
227
243
  if field_value is None:
228
244
  return None
229
245
  return format_method(field_value) if format_method else field_value
246
+
247
+
248
+ @retry(
249
+ retry=retry_if_not_exception_type((NotFoundError, RedirectionDetectedError, ProxyError)),
250
+ wait=wait_exponential(exp_base=3, multiplier=3, max=60),
251
+ stop=stop_after_attempt(5),
252
+ before_sleep=before_sleep_log(logger, logging.WARNING),
253
+ reraise=True,
254
+ )
255
+ def get_data(
256
+ url: str,
257
+ method: str = "GET",
258
+ output: str = "json",
259
+ sleep: tuple = (6, 3),
260
+ proxy_interface: ProxyInterface = None,
261
+ use_auth_proxies: bool = False,
262
+ max_proxy_delay: int = 1800,
263
+ **kwargs,
264
+ ):
265
+ retry_type = retry_if_exception_type(ProxyError)
266
+ wait = wait_exponential(exp_base=3, multiplier=3, max=60)
267
+ stop = stop_after_delay(max_proxy_delay)
268
+ before_sleep = before_sleep_log(logger, logging.WARNING)
269
+
270
+ @retry(retry=retry_type, wait=wait, stop=stop, before_sleep=before_sleep, reraise=True)
271
+ def _fetch_with_proxy_retry(url, method, proxy_interface, use_auth, **params):
272
+ logger.info(f"Fetching data from {url} ...")
273
+ proxy_cfg = None
274
+ if proxy_interface:
275
+ host, port, user, pwd = proxy_interface.get_proxies(raw=True, use_auth=use_auth)
276
+ if host and port:
277
+ proxy_url = f"http://{host}:{port}"
278
+ proxy_auth_url = f"http://{user}:{pwd}@{host}:{port}"
279
+ proxy_cfg = {"http": proxy_url, "https": proxy_url}
280
+ if user and pwd:
281
+ proxy_cfg = {"http": proxy_auth_url, "https": proxy_auth_url}
282
+ logger.info(f"Using proxy: {proxy_url}")
283
+ response = getattr(requests, method.lower())(url, proxies=proxy_cfg, **params)
284
+ return response
285
+
286
+ params = {"timeout": 30} | kwargs
287
+ r = _fetch_with_proxy_retry(url, method, proxy_interface, use_auth_proxies, **params)
288
+
289
+ ban_sleep(*sleep)
290
+
291
+ if r.status_code == 404:
292
+ raise NotFoundError(f"404 Not Found error for {url}")
293
+ r.raise_for_status()
294
+ r.encoding = "utf-8"
295
+
296
+ if output == "json":
297
+ return r.json()
298
+ elif output == "text":
299
+ return r.text
300
+ elif output == "soup":
301
+ return BeautifulSoup(r.content, "html.parser")
302
+ elif output == "response":
303
+ return r
datamarket-0.9.42.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: datamarket
3
- Version: 0.9.40
3
+ Version: 0.9.42
4
4
  Summary: Utilities that integrate advanced scraping knowledge into just one library.
5
5
  License: GPL-3.0-or-later
6
6
  Author: DataMarket
@@ -48,6 +48,8 @@ Provides-Extra: proxy
48
48
  Provides-Extra: pyarrow
49
49
  Provides-Extra: pydrive2
50
50
  Provides-Extra: pymupdf
51
+ Provides-Extra: pyproj
52
+ Provides-Extra: pyrate-limiter
51
53
  Provides-Extra: pysocks
52
54
  Provides-Extra: pyspark
53
55
  Provides-Extra: pytest
@@ -105,6 +107,8 @@ Requires-Dist: pyarrow (>=19.0.0,<20.0.0) ; extra == "pyarrow"
105
107
  Requires-Dist: pycountry (>=24.0.0,<25.0.0)
106
108
  Requires-Dist: pydrive2 (>=1.0.0,<2.0.0) ; extra == "pydrive2" or extra == "drive"
107
109
  Requires-Dist: pymupdf (>=1.0.0,<2.0.0) ; extra == "pymupdf"
110
+ Requires-Dist: pyproj (>=3.0.0,<4.0.0) ; extra == "pyproj"
111
+ Requires-Dist: pyrate-limiter (>=3.0.0,<4.0.0) ; extra == "pyrate-limiter"
108
112
  Requires-Dist: pysocks (>=1.0.0,<2.0.0) ; extra == "pysocks"
109
113
  Requires-Dist: pyspark (>=3.0.0,<4.0.0) ; extra == "pyspark"
110
114
  Requires-Dist: pytest (>=8.0.0,<9.0.0) ; extra == "pytest"
datamarket-0.9.42.dist-info/RECORD CHANGED
@@ -1,4 +1,6 @@
1
1
  datamarket/__init__.py,sha256=FHS77P9qNewKMoN-p0FLEUEC60oWIYup1QkbJZP4ays,12
2
+ datamarket/exceptions/__init__.py,sha256=-Vu-RZNKjW6fYCLqbUJTkKNuHeA8Yi_gyR50oZNaA_8,33
3
+ datamarket/exceptions/main.py,sha256=MP5ql6M7DoMbBf-Dg_2ohcUFdWXgzv-dXHntPPit31s,453
2
4
  datamarket/interfaces/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
5
  datamarket/interfaces/alchemy.py,sha256=mQwjDqBpz1QHRV2JTCALvn5iK_ky69oE2Gw-EtRXsqQ,14664
4
6
  datamarket/interfaces/aws.py,sha256=7KLUeBxmPN7avEMPsu5HC_KHB1N7W6Anp2X8fo43mlw,2383
@@ -10,10 +12,10 @@ datamarket/interfaces/proxy.py,sha256=updoOStKd8-nQBbxWbnD9eOt6HksnYi-5dQ0rEySf5
10
12
  datamarket/interfaces/tinybird.py,sha256=AYrcRGNOCoCt7ojilkWa27POROee9sTCwZ61GGHEPeM,2698
11
13
  datamarket/params/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
14
  datamarket/params/nominatim.py,sha256=XrCru3yEbs-X3ueOaCeSTBZwi4CWHW7oNhEyexBleMw,1184
13
- datamarket/utils/__init__.py,sha256=8D5a8oKgqd6WA1RUkiKCn4l_PVemtyuckxQut0vDHXM,20
15
+ datamarket/utils/__init__.py,sha256=FHLh-Qp9XpM4LkAocppCf_llW2CWVVghGorkqxqt1wk,34
14
16
  datamarket/utils/airflow.py,sha256=al0vc0YUikNu3Oy51VSn52I7pMU40akFBOl_UlHa2E4,795
15
17
  datamarket/utils/alchemy.py,sha256=SRq6kgh1aANXVShBPgAuglmNhZssPWwWEY503gKSia8,635
16
- datamarket/utils/main.py,sha256=j8wnAxeLvijdRU9M4V6HunWH7vgWWHP4u4xamzkWcUU,7009
18
+ datamarket/utils/main.py,sha256=DMMgkQnMS6fNziTru8FM9z2ERfYfkdR9qFPF7s6sp7U,9657
17
19
  datamarket/utils/playwright/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
20
  datamarket/utils/playwright/async_api.py,sha256=pWfVj-ItfIeZBxG7WiUHKSeZDcHQFUQ5mrNGyIh1IdA,883
19
21
  datamarket/utils/playwright/sync_api.py,sha256=lIGm8mLxhFg04LVNdF8SO_9yjOLBnWe5pPry6ZFMnIg,846
@@ -24,7 +26,7 @@ datamarket/utils/strings/normalization.py,sha256=QLZ-THzjGOK9eWPPR1PrsffwQkSOx_M
24
26
  datamarket/utils/strings/obfuscation.py,sha256=8gMepfjPq0N4_IpKR6i2dy_9VJugQ3qJiRiRvKavB3s,5246
25
27
  datamarket/utils/typer.py,sha256=FDF3l6gh3UlAFPsHCtesnekvct2rKz0oFn3uKARBQvE,814
26
28
  datamarket/utils/types.py,sha256=vxdQZdwdXrfPR4Es52gBgol-tMRIOD6oK9cBo3rB0JQ,74
27
- datamarket-0.9.40.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
28
- datamarket-0.9.40.dist-info/METADATA,sha256=odG5B_7jOuXZFBNe260K0HP7vkDntMf8yAUOgSxNOws,6961
29
- datamarket-0.9.40.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
30
- datamarket-0.9.40.dist-info/RECORD,,
29
+ datamarket-0.9.42.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
30
+ datamarket-0.9.42.dist-info/METADATA,sha256=psplepRhULPjVtWuUwxCUEcK3mrS923oWPXPh8wPnlw,7149
31
+ datamarket-0.9.42.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
32
+ datamarket-0.9.42.dist-info/RECORD,,