datamarket 0.9.40__tar.gz → 0.9.41__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datamarket might be problematic. Click here for more details.

Files changed (33)
  1. {datamarket-0.9.40 → datamarket-0.9.41}/PKG-INFO +3 -1
  2. {datamarket-0.9.40 → datamarket-0.9.41}/pyproject.toml +3 -1
  3. datamarket-0.9.41/src/datamarket/exceptions/__init__.py +1 -0
  4. datamarket-0.9.41/src/datamarket/exceptions/main.py +14 -0
  5. datamarket-0.9.41/src/datamarket/utils/__init__.py +1 -0
  6. {datamarket-0.9.40 → datamarket-0.9.41}/src/datamarket/utils/main.py +79 -5
  7. datamarket-0.9.40/src/datamarket/utils/__init__.py +0 -1
  8. {datamarket-0.9.40 → datamarket-0.9.41}/LICENSE +0 -0
  9. {datamarket-0.9.40 → datamarket-0.9.41}/README.md +0 -0
  10. {datamarket-0.9.40 → datamarket-0.9.41}/src/datamarket/__init__.py +0 -0
  11. {datamarket-0.9.40 → datamarket-0.9.41}/src/datamarket/interfaces/__init__.py +0 -0
  12. {datamarket-0.9.40 → datamarket-0.9.41}/src/datamarket/interfaces/alchemy.py +0 -0
  13. {datamarket-0.9.40 → datamarket-0.9.41}/src/datamarket/interfaces/aws.py +0 -0
  14. {datamarket-0.9.40 → datamarket-0.9.41}/src/datamarket/interfaces/drive.py +0 -0
  15. {datamarket-0.9.40 → datamarket-0.9.41}/src/datamarket/interfaces/ftp.py +0 -0
  16. {datamarket-0.9.40 → datamarket-0.9.41}/src/datamarket/interfaces/nominatim.py +0 -0
  17. {datamarket-0.9.40 → datamarket-0.9.41}/src/datamarket/interfaces/peerdb.py +0 -0
  18. {datamarket-0.9.40 → datamarket-0.9.41}/src/datamarket/interfaces/proxy.py +0 -0
  19. {datamarket-0.9.40 → datamarket-0.9.41}/src/datamarket/interfaces/tinybird.py +0 -0
  20. {datamarket-0.9.40 → datamarket-0.9.41}/src/datamarket/params/__init__.py +0 -0
  21. {datamarket-0.9.40 → datamarket-0.9.41}/src/datamarket/params/nominatim.py +0 -0
  22. {datamarket-0.9.40 → datamarket-0.9.41}/src/datamarket/utils/airflow.py +0 -0
  23. {datamarket-0.9.40 → datamarket-0.9.41}/src/datamarket/utils/alchemy.py +0 -0
  24. {datamarket-0.9.40 → datamarket-0.9.41}/src/datamarket/utils/playwright/__init__.py +0 -0
  25. {datamarket-0.9.40 → datamarket-0.9.41}/src/datamarket/utils/playwright/async_api.py +0 -0
  26. {datamarket-0.9.40 → datamarket-0.9.41}/src/datamarket/utils/playwright/sync_api.py +0 -0
  27. {datamarket-0.9.40 → datamarket-0.9.41}/src/datamarket/utils/selenium.py +0 -0
  28. {datamarket-0.9.40 → datamarket-0.9.41}/src/datamarket/utils/soda.py +0 -0
  29. {datamarket-0.9.40 → datamarket-0.9.41}/src/datamarket/utils/strings/__init__.py +0 -0
  30. {datamarket-0.9.40 → datamarket-0.9.41}/src/datamarket/utils/strings/normalization.py +0 -0
  31. {datamarket-0.9.40 → datamarket-0.9.41}/src/datamarket/utils/strings/obfuscation.py +0 -0
  32. {datamarket-0.9.40 → datamarket-0.9.41}/src/datamarket/utils/typer.py +0 -0
  33. {datamarket-0.9.40 → datamarket-0.9.41}/src/datamarket/utils/types.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: datamarket
3
- Version: 0.9.40
3
+ Version: 0.9.41
4
4
  Summary: Utilities that integrate advanced scraping knowledge into just one library.
5
5
  License: GPL-3.0-or-later
6
6
  Author: DataMarket
@@ -48,6 +48,7 @@ Provides-Extra: proxy
48
48
  Provides-Extra: pyarrow
49
49
  Provides-Extra: pydrive2
50
50
  Provides-Extra: pymupdf
51
+ Provides-Extra: pyrate-limiter
51
52
  Provides-Extra: pysocks
52
53
  Provides-Extra: pyspark
53
54
  Provides-Extra: pytest
@@ -105,6 +106,7 @@ Requires-Dist: pyarrow (>=19.0.0,<20.0.0) ; extra == "pyarrow"
105
106
  Requires-Dist: pycountry (>=24.0.0,<25.0.0)
106
107
  Requires-Dist: pydrive2 (>=1.0.0,<2.0.0) ; extra == "pydrive2" or extra == "drive"
107
108
  Requires-Dist: pymupdf (>=1.0.0,<2.0.0) ; extra == "pymupdf"
109
+ Requires-Dist: pyrate-limiter (>=3.0.0,<4.0.0) ; extra == "pyrate-limiter"
108
110
  Requires-Dist: pysocks (>=1.0.0,<2.0.0) ; extra == "pysocks"
109
111
  Requires-Dist: pyspark (>=3.0.0,<4.0.0) ; extra == "pyspark"
110
112
  Requires-Dist: pytest (>=8.0.0,<9.0.0) ; extra == "pytest"
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "datamarket"
3
- version = "0.9.40"
3
+ version = "0.9.41"
4
4
  description = "Utilities that integrate advanced scraping knowledge into just one library."
5
5
  authors = ["DataMarket <techsupport@datamarket.es>"]
6
6
  license = "GPL-3.0-or-later"
@@ -83,6 +83,7 @@ presidio-anonymizer = { version = "^2.0.0", optional = true }
83
83
  spacy = { version = "^3.0.0", optional = true }
84
84
  spacy-langdetect = { version = "~0.1.0", optional = true }
85
85
  pandarallel = { version = "^1.0.0", optional = true }
86
+ pyrate-limiter = { version = "^3.0.0", optional = true }
86
87
 
87
88
  [tool.poetry.extras]
88
89
  boto3 = ["boto3"]
@@ -128,6 +129,7 @@ openpyxl = ["openpyxl"]
128
129
  httpx = ["httpx"]
129
130
  camoufox = ["camoufox", "browserforge", "playwright"]
130
131
  pandarallel = ["pandarallel"]
132
+ pyrate-limiter = ["pyrate-limiter"]
131
133
 
132
134
  # Interface groups
133
135
  aws = ["boto3"]
@@ -0,0 +1 @@
1
+ from .main import * # noqa: F403
@@ -0,0 +1,14 @@
1
+ ########################################################################################################################
2
+ # CLASSES
3
+
4
+
5
+ class RedirectionDetectedError(Exception):
6
+ def __init__(self, message="Redirection detected!"):
7
+ self.message = message
8
+ super().__init__(self.message)
9
+
10
+
11
+ class NotFoundError(Exception):
12
+ def __init__(self, message="Not found!"):
13
+ self.message = message
14
+ super().__init__(self.message)
@@ -0,0 +1 @@
1
+ from .main import * # noqa: F403
@@ -13,8 +13,26 @@ from pathlib import Path
13
13
  from typing import Any, Literal, Self, Union
14
14
 
15
15
  import pendulum
16
+ import requests
17
+ from bs4 import BeautifulSoup
16
18
  from croniter import croniter
17
19
  from dynaconf import Dynaconf, add_converter
20
+ from requests.exceptions import ProxyError
21
+ from tenacity import (
22
+ before_sleep_log,
23
+ retry,
24
+ retry_if_exception_type,
25
+ retry_if_not_exception_type,
26
+ stop_after_attempt,
27
+ stop_after_delay,
28
+ wait_exponential,
29
+ )
30
+
31
+ from ..exceptions import NotFoundError, RedirectionDetectedError
32
+ from ..interfaces.proxy import ProxyInterface
33
+
34
+ ########################################################################################################################
35
+ # FUNCTIONS
18
36
 
19
37
  logger = logging.getLogger(__name__)
20
38
 
@@ -26,9 +44,7 @@ class NoProjectFoundError(Exception):
26
44
 
27
45
  class NoPackageFoundError(Exception):
28
46
  def __init__(self):
29
- super().__init__(
30
- "A project was detected but it has no packages inside the 'src' directory"
31
- )
47
+ super().__init__("A project was detected but it has no packages inside the 'src' directory")
32
48
 
33
49
 
34
50
  ########################################################################################################################
@@ -84,8 +100,8 @@ class Project:
84
100
 
85
101
  try:
86
102
  self.pkg_name = next((self.path / "src").glob("*")).name
87
- except StopIteration:
88
- raise NoPackageFoundError()
103
+ except StopIteration as e:
104
+ raise NoPackageFoundError() from e
89
105
 
90
106
  self.env_name = f"{self.pkg_name}_env"
91
107
  self.config_path = self.path / self.CONFIG_FILE_NAME
@@ -227,3 +243,61 @@ def parse_field(dict_struct, field_path, format_method=None):
227
243
  if field_value is None:
228
244
  return None
229
245
  return format_method(field_value) if format_method else field_value
246
+
247
+
248
+ @retry(
249
+ retry=retry_if_not_exception_type((NotFoundError, RedirectionDetectedError, ProxyError)),
250
+ wait=wait_exponential(exp_base=3, multiplier=3, max=60),
251
+ stop=stop_after_attempt(5),
252
+ before_sleep=before_sleep_log(logger, logging.WARNING),
253
+ reraise=True,
254
+ )
255
+ def get_data(
256
+ url: str,
257
+ method: str = "GET",
258
+ output: str = "json",
259
+ sleep: tuple = (6, 3),
260
+ proxy_interface: ProxyInterface = None,
261
+ use_auth_proxies: bool = False,
262
+ max_proxy_delay: int = 1800,
263
+ **kwargs,
264
+ ):
265
+ retry_type = retry_if_exception_type(ProxyError)
266
+ wait = wait_exponential(exp_base=3, multiplier=3, max=60)
267
+ stop = stop_after_delay(max_proxy_delay)
268
+ before_sleep = before_sleep_log(logger, logging.WARNING)
269
+
270
+ @retry(retry=retry_type, wait=wait, stop=stop, before_sleep=before_sleep, reraise=True)
271
+ def _fetch_with_proxy_retry(url, method, proxy_interface, use_auth, **params):
272
+ logger.info(f"Fetching data from {url} ...")
273
+ proxy_cfg = None
274
+ if proxy_interface:
275
+ host, port, user, pwd = proxy_interface.get_proxies(raw=True, use_auth=use_auth)
276
+ if host and port:
277
+ proxy_url = f"http://{host}:{port}"
278
+ proxy_auth_url = f"http://{user}:{pwd}@{host}:{port}"
279
+ proxy_cfg = {"http": proxy_url, "https": proxy_url}
280
+ if user and pwd:
281
+ proxy_cfg = {"http": proxy_auth_url, "https": proxy_auth_url}
282
+ logger.info(f"Using proxy: {proxy_url}")
283
+ response = getattr(requests, method.lower())(url, proxies=proxy_cfg, **params)
284
+ return response
285
+
286
+ params = {"timeout": 30} | kwargs
287
+ r = _fetch_with_proxy_retry(url, method, proxy_interface, use_auth_proxies, **params)
288
+
289
+ ban_sleep(*sleep)
290
+
291
+ if r.status_code == 404:
292
+ raise NotFoundError(f"404 Not Found error for {url}")
293
+ r.raise_for_status()
294
+ r.encoding = "utf-8"
295
+
296
+ if output == "json":
297
+ return r.json()
298
+ elif output == "text":
299
+ return r.text
300
+ elif output == "soup":
301
+ return BeautifulSoup(r.content, "html.parser")
302
+ elif output == "response":
303
+ return r
@@ -1 +0,0 @@
1
- from .main import *
File without changes
File without changes