datamarket 0.9.40__py3-none-any.whl → 0.9.42__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datamarket might be problematic. Click here for more details.
- datamarket/exceptions/__init__.py +1 -0
- datamarket/exceptions/main.py +14 -0
- datamarket/utils/__init__.py +1 -1
- datamarket/utils/main.py +79 -5
- {datamarket-0.9.40.dist-info → datamarket-0.9.42.dist-info}/METADATA +5 -1
- {datamarket-0.9.40.dist-info → datamarket-0.9.42.dist-info}/RECORD +8 -6
- {datamarket-0.9.40.dist-info → datamarket-0.9.42.dist-info}/LICENSE +0 -0
- {datamarket-0.9.40.dist-info → datamarket-0.9.42.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .main import * # noqa: F403
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
########################################################################################################################
|
|
2
|
+
# CLASSES
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class RedirectionDetectedError(Exception):
|
|
6
|
+
def __init__(self, message="Redirection detected!"):
|
|
7
|
+
self.message = message
|
|
8
|
+
super().__init__(self.message)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class NotFoundError(Exception):
|
|
12
|
+
def __init__(self, message="Not found!"):
|
|
13
|
+
self.message = message
|
|
14
|
+
super().__init__(self.message)
|
datamarket/utils/__init__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
from .main import *
|
|
1
|
+
from .main import * # noqa: F403
|
datamarket/utils/main.py
CHANGED
|
@@ -13,8 +13,26 @@ from pathlib import Path
|
|
|
13
13
|
from typing import Any, Literal, Self, Union
|
|
14
14
|
|
|
15
15
|
import pendulum
|
|
16
|
+
import requests
|
|
17
|
+
from bs4 import BeautifulSoup
|
|
16
18
|
from croniter import croniter
|
|
17
19
|
from dynaconf import Dynaconf, add_converter
|
|
20
|
+
from requests.exceptions import ProxyError
|
|
21
|
+
from tenacity import (
|
|
22
|
+
before_sleep_log,
|
|
23
|
+
retry,
|
|
24
|
+
retry_if_exception_type,
|
|
25
|
+
retry_if_not_exception_type,
|
|
26
|
+
stop_after_attempt,
|
|
27
|
+
stop_after_delay,
|
|
28
|
+
wait_exponential,
|
|
29
|
+
)
|
|
30
|
+
|
|
31
|
+
from ..exceptions import NotFoundError, RedirectionDetectedError
|
|
32
|
+
from ..interfaces.proxy import ProxyInterface
|
|
33
|
+
|
|
34
|
+
########################################################################################################################
|
|
35
|
+
# FUNCTIONS
|
|
18
36
|
|
|
19
37
|
logger = logging.getLogger(__name__)
|
|
20
38
|
|
|
@@ -26,9 +44,7 @@ class NoProjectFoundError(Exception):
|
|
|
26
44
|
|
|
27
45
|
class NoPackageFoundError(Exception):
|
|
28
46
|
def __init__(self):
|
|
29
|
-
super().__init__(
|
|
30
|
-
"A project was detected but it has no packages inside the 'src' directory"
|
|
31
|
-
)
|
|
47
|
+
super().__init__("A project was detected but it has no packages inside the 'src' directory")
|
|
32
48
|
|
|
33
49
|
|
|
34
50
|
########################################################################################################################
|
|
@@ -84,8 +100,8 @@ class Project:
|
|
|
84
100
|
|
|
85
101
|
try:
|
|
86
102
|
self.pkg_name = next((self.path / "src").glob("*")).name
|
|
87
|
-
except StopIteration:
|
|
88
|
-
raise NoPackageFoundError()
|
|
103
|
+
except StopIteration as e:
|
|
104
|
+
raise NoPackageFoundError() from e
|
|
89
105
|
|
|
90
106
|
self.env_name = f"{self.pkg_name}_env"
|
|
91
107
|
self.config_path = self.path / self.CONFIG_FILE_NAME
|
|
@@ -227,3 +243,61 @@ def parse_field(dict_struct, field_path, format_method=None):
|
|
|
227
243
|
if field_value is None:
|
|
228
244
|
return None
|
|
229
245
|
return format_method(field_value) if format_method else field_value
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
@retry(
|
|
249
|
+
retry=retry_if_not_exception_type((NotFoundError, RedirectionDetectedError, ProxyError)),
|
|
250
|
+
wait=wait_exponential(exp_base=3, multiplier=3, max=60),
|
|
251
|
+
stop=stop_after_attempt(5),
|
|
252
|
+
before_sleep=before_sleep_log(logger, logging.WARNING),
|
|
253
|
+
reraise=True,
|
|
254
|
+
)
|
|
255
|
+
def get_data(
|
|
256
|
+
url: str,
|
|
257
|
+
method: str = "GET",
|
|
258
|
+
output: str = "json",
|
|
259
|
+
sleep: tuple = (6, 3),
|
|
260
|
+
proxy_interface: ProxyInterface = None,
|
|
261
|
+
use_auth_proxies: bool = False,
|
|
262
|
+
max_proxy_delay: int = 1800,
|
|
263
|
+
**kwargs,
|
|
264
|
+
):
|
|
265
|
+
retry_type = retry_if_exception_type(ProxyError)
|
|
266
|
+
wait = wait_exponential(exp_base=3, multiplier=3, max=60)
|
|
267
|
+
stop = stop_after_delay(max_proxy_delay)
|
|
268
|
+
before_sleep = before_sleep_log(logger, logging.WARNING)
|
|
269
|
+
|
|
270
|
+
@retry(retry=retry_type, wait=wait, stop=stop, before_sleep=before_sleep, reraise=True)
|
|
271
|
+
def _fetch_with_proxy_retry(url, method, proxy_interface, use_auth, **params):
|
|
272
|
+
logger.info(f"Fetching data from {url} ...")
|
|
273
|
+
proxy_cfg = None
|
|
274
|
+
if proxy_interface:
|
|
275
|
+
host, port, user, pwd = proxy_interface.get_proxies(raw=True, use_auth=use_auth)
|
|
276
|
+
if host and port:
|
|
277
|
+
proxy_url = f"http://{host}:{port}"
|
|
278
|
+
proxy_auth_url = f"http://{user}:{pwd}@{host}:{port}"
|
|
279
|
+
proxy_cfg = {"http": proxy_url, "https": proxy_url}
|
|
280
|
+
if user and pwd:
|
|
281
|
+
proxy_cfg = {"http": proxy_auth_url, "https": proxy_auth_url}
|
|
282
|
+
logger.info(f"Using proxy: {proxy_url}")
|
|
283
|
+
response = getattr(requests, method.lower())(url, proxies=proxy_cfg, **params)
|
|
284
|
+
return response
|
|
285
|
+
|
|
286
|
+
params = {"timeout": 30} | kwargs
|
|
287
|
+
r = _fetch_with_proxy_retry(url, method, proxy_interface, use_auth_proxies, **params)
|
|
288
|
+
|
|
289
|
+
ban_sleep(*sleep)
|
|
290
|
+
|
|
291
|
+
if r.status_code == 404:
|
|
292
|
+
raise NotFoundError(f"404 Not Found error for {url}")
|
|
293
|
+
r.raise_for_status()
|
|
294
|
+
r.encoding = "utf-8"
|
|
295
|
+
|
|
296
|
+
if output == "json":
|
|
297
|
+
return r.json()
|
|
298
|
+
elif output == "text":
|
|
299
|
+
return r.text
|
|
300
|
+
elif output == "soup":
|
|
301
|
+
return BeautifulSoup(r.content, "html.parser")
|
|
302
|
+
elif output == "response":
|
|
303
|
+
return r
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: datamarket
|
|
3
|
-
Version: 0.9.40
|
|
3
|
+
Version: 0.9.42
|
|
4
4
|
Summary: Utilities that integrate advanced scraping knowledge into just one library.
|
|
5
5
|
License: GPL-3.0-or-later
|
|
6
6
|
Author: DataMarket
|
|
@@ -48,6 +48,8 @@ Provides-Extra: proxy
|
|
|
48
48
|
Provides-Extra: pyarrow
|
|
49
49
|
Provides-Extra: pydrive2
|
|
50
50
|
Provides-Extra: pymupdf
|
|
51
|
+
Provides-Extra: pyproj
|
|
52
|
+
Provides-Extra: pyrate-limiter
|
|
51
53
|
Provides-Extra: pysocks
|
|
52
54
|
Provides-Extra: pyspark
|
|
53
55
|
Provides-Extra: pytest
|
|
@@ -105,6 +107,8 @@ Requires-Dist: pyarrow (>=19.0.0,<20.0.0) ; extra == "pyarrow"
|
|
|
105
107
|
Requires-Dist: pycountry (>=24.0.0,<25.0.0)
|
|
106
108
|
Requires-Dist: pydrive2 (>=1.0.0,<2.0.0) ; extra == "pydrive2" or extra == "drive"
|
|
107
109
|
Requires-Dist: pymupdf (>=1.0.0,<2.0.0) ; extra == "pymupdf"
|
|
110
|
+
Requires-Dist: pyproj (>=3.0.0,<4.0.0) ; extra == "pyproj"
|
|
111
|
+
Requires-Dist: pyrate-limiter (>=3.0.0,<4.0.0) ; extra == "pyrate-limiter"
|
|
108
112
|
Requires-Dist: pysocks (>=1.0.0,<2.0.0) ; extra == "pysocks"
|
|
109
113
|
Requires-Dist: pyspark (>=3.0.0,<4.0.0) ; extra == "pyspark"
|
|
110
114
|
Requires-Dist: pytest (>=8.0.0,<9.0.0) ; extra == "pytest"
|
|
@@ -1,4 +1,6 @@
|
|
|
1
1
|
datamarket/__init__.py,sha256=FHS77P9qNewKMoN-p0FLEUEC60oWIYup1QkbJZP4ays,12
|
|
2
|
+
datamarket/exceptions/__init__.py,sha256=-Vu-RZNKjW6fYCLqbUJTkKNuHeA8Yi_gyR50oZNaA_8,33
|
|
3
|
+
datamarket/exceptions/main.py,sha256=MP5ql6M7DoMbBf-Dg_2ohcUFdWXgzv-dXHntPPit31s,453
|
|
2
4
|
datamarket/interfaces/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
5
|
datamarket/interfaces/alchemy.py,sha256=mQwjDqBpz1QHRV2JTCALvn5iK_ky69oE2Gw-EtRXsqQ,14664
|
|
4
6
|
datamarket/interfaces/aws.py,sha256=7KLUeBxmPN7avEMPsu5HC_KHB1N7W6Anp2X8fo43mlw,2383
|
|
@@ -10,10 +12,10 @@ datamarket/interfaces/proxy.py,sha256=updoOStKd8-nQBbxWbnD9eOt6HksnYi-5dQ0rEySf5
|
|
|
10
12
|
datamarket/interfaces/tinybird.py,sha256=AYrcRGNOCoCt7ojilkWa27POROee9sTCwZ61GGHEPeM,2698
|
|
11
13
|
datamarket/params/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
12
14
|
datamarket/params/nominatim.py,sha256=XrCru3yEbs-X3ueOaCeSTBZwi4CWHW7oNhEyexBleMw,1184
|
|
13
|
-
datamarket/utils/__init__.py,sha256=
|
|
15
|
+
datamarket/utils/__init__.py,sha256=FHLh-Qp9XpM4LkAocppCf_llW2CWVVghGorkqxqt1wk,34
|
|
14
16
|
datamarket/utils/airflow.py,sha256=al0vc0YUikNu3Oy51VSn52I7pMU40akFBOl_UlHa2E4,795
|
|
15
17
|
datamarket/utils/alchemy.py,sha256=SRq6kgh1aANXVShBPgAuglmNhZssPWwWEY503gKSia8,635
|
|
16
|
-
datamarket/utils/main.py,sha256=
|
|
18
|
+
datamarket/utils/main.py,sha256=DMMgkQnMS6fNziTru8FM9z2ERfYfkdR9qFPF7s6sp7U,9657
|
|
17
19
|
datamarket/utils/playwright/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
18
20
|
datamarket/utils/playwright/async_api.py,sha256=pWfVj-ItfIeZBxG7WiUHKSeZDcHQFUQ5mrNGyIh1IdA,883
|
|
19
21
|
datamarket/utils/playwright/sync_api.py,sha256=lIGm8mLxhFg04LVNdF8SO_9yjOLBnWe5pPry6ZFMnIg,846
|
|
@@ -24,7 +26,7 @@ datamarket/utils/strings/normalization.py,sha256=QLZ-THzjGOK9eWPPR1PrsffwQkSOx_M
|
|
|
24
26
|
datamarket/utils/strings/obfuscation.py,sha256=8gMepfjPq0N4_IpKR6i2dy_9VJugQ3qJiRiRvKavB3s,5246
|
|
25
27
|
datamarket/utils/typer.py,sha256=FDF3l6gh3UlAFPsHCtesnekvct2rKz0oFn3uKARBQvE,814
|
|
26
28
|
datamarket/utils/types.py,sha256=vxdQZdwdXrfPR4Es52gBgol-tMRIOD6oK9cBo3rB0JQ,74
|
|
27
|
-
datamarket-0.9.40.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
|
28
|
-
datamarket-0.9.
|
|
29
|
-
datamarket-0.9.40.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
|
|
30
|
-
datamarket-0.9.40.dist-info/RECORD,,
|
|
29
|
+
datamarket-0.9.42.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
|
30
|
+
datamarket-0.9.42.dist-info/METADATA,sha256=psplepRhULPjVtWuUwxCUEcK3mrS923oWPXPh8wPnlw,7149
|
|
31
|
+
datamarket-0.9.42.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
|
|
32
|
+
datamarket-0.9.42.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|