datamarket 0.7.100__tar.gz → 0.7.102__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datamarket might be problematic. Click here for more details.
- {datamarket-0.7.100 → datamarket-0.7.102}/PKG-INFO +1 -1
- {datamarket-0.7.100 → datamarket-0.7.102}/pyproject.toml +1 -1
- datamarket-0.7.102/src/datamarket/exceptions/main.py +33 -0
- {datamarket-0.7.100 → datamarket-0.7.102}/src/datamarket/utils/main.py +44 -3
- datamarket-0.7.100/src/datamarket/exceptions/main.py +0 -14
- {datamarket-0.7.100 → datamarket-0.7.102}/LICENSE +0 -0
- {datamarket-0.7.100 → datamarket-0.7.102}/README.md +0 -0
- {datamarket-0.7.100 → datamarket-0.7.102}/src/datamarket/__init__.py +0 -0
- {datamarket-0.7.100 → datamarket-0.7.102}/src/datamarket/exceptions/__init__.py +0 -0
- {datamarket-0.7.100 → datamarket-0.7.102}/src/datamarket/interfaces/__init__.py +0 -0
- {datamarket-0.7.100 → datamarket-0.7.102}/src/datamarket/interfaces/alchemy.py +0 -0
- {datamarket-0.7.100 → datamarket-0.7.102}/src/datamarket/interfaces/aws.py +0 -0
- {datamarket-0.7.100 → datamarket-0.7.102}/src/datamarket/interfaces/azure.py +0 -0
- {datamarket-0.7.100 → datamarket-0.7.102}/src/datamarket/interfaces/drive.py +0 -0
- {datamarket-0.7.100 → datamarket-0.7.102}/src/datamarket/interfaces/ftp.py +0 -0
- {datamarket-0.7.100 → datamarket-0.7.102}/src/datamarket/interfaces/nominatim.py +0 -0
- {datamarket-0.7.100 → datamarket-0.7.102}/src/datamarket/interfaces/peerdb.py +0 -0
- {datamarket-0.7.100 → datamarket-0.7.102}/src/datamarket/interfaces/proxy.py +0 -0
- {datamarket-0.7.100 → datamarket-0.7.102}/src/datamarket/interfaces/tinybird.py +0 -0
- {datamarket-0.7.100 → datamarket-0.7.102}/src/datamarket/params/__init__.py +0 -0
- {datamarket-0.7.100 → datamarket-0.7.102}/src/datamarket/params/nominatim.py +0 -0
- {datamarket-0.7.100 → datamarket-0.7.102}/src/datamarket/utils/__init__.py +0 -0
- {datamarket-0.7.100 → datamarket-0.7.102}/src/datamarket/utils/airflow.py +0 -0
- {datamarket-0.7.100 → datamarket-0.7.102}/src/datamarket/utils/alchemy.py +0 -0
- {datamarket-0.7.100 → datamarket-0.7.102}/src/datamarket/utils/nominatim.py +0 -0
- {datamarket-0.7.100 → datamarket-0.7.102}/src/datamarket/utils/playwright/__init__.py +0 -0
- {datamarket-0.7.100 → datamarket-0.7.102}/src/datamarket/utils/playwright/async_api.py +0 -0
- {datamarket-0.7.100 → datamarket-0.7.102}/src/datamarket/utils/playwright/sync_api.py +0 -0
- {datamarket-0.7.100 → datamarket-0.7.102}/src/datamarket/utils/selenium.py +0 -0
- {datamarket-0.7.100 → datamarket-0.7.102}/src/datamarket/utils/soda.py +0 -0
- {datamarket-0.7.100 → datamarket-0.7.102}/src/datamarket/utils/strings/__init__.py +0 -0
- {datamarket-0.7.100 → datamarket-0.7.102}/src/datamarket/utils/strings/normalization.py +0 -0
- {datamarket-0.7.100 → datamarket-0.7.102}/src/datamarket/utils/strings/obfuscation.py +0 -0
- {datamarket-0.7.100 → datamarket-0.7.102}/src/datamarket/utils/strings/standardization.py +0 -0
- {datamarket-0.7.100 → datamarket-0.7.102}/src/datamarket/utils/typer.py +0 -0
- {datamarket-0.7.100 → datamarket-0.7.102}/src/datamarket/utils/types.py +0 -0
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
########################################################################################################################
|
|
2
|
+
# CLASSES
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
import requests
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class RedirectionDetectedError(Exception):
|
|
9
|
+
def __init__(self, message="Redirection detected!"):
|
|
10
|
+
self.message = message
|
|
11
|
+
super().__init__(self.message)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class NotFoundError(Exception):
|
|
15
|
+
def __init__(self, message="Not found!"):
|
|
16
|
+
self.message = message
|
|
17
|
+
super().__init__(self.message)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class BadRequestError(Exception):
|
|
21
|
+
def __init__(self, message="Bad request!"):
|
|
22
|
+
self.message = message
|
|
23
|
+
super().__init__(self.message)
|
|
24
|
+
|
|
25
|
+
class ManagedHTTPError(Exception):
|
|
26
|
+
"""Signal that this HTTP status was handled and should not be retried."""
|
|
27
|
+
def __init__(self, response: requests.Response, *, url: str | None = None, message: str | None = None):
|
|
28
|
+
self.response = response
|
|
29
|
+
self.request = getattr(response, "request", None)
|
|
30
|
+
self.status_code = getattr(response, "status_code", None)
|
|
31
|
+
self.url = url or (self.request.url if self.request is not None else None)
|
|
32
|
+
self.message = message
|
|
33
|
+
super().__init__(message or f"HTTP {self.status_code} for {self.url}")
|
|
@@ -3,12 +3,14 @@
|
|
|
3
3
|
|
|
4
4
|
import asyncio
|
|
5
5
|
import configparser
|
|
6
|
+
from datetime import timedelta
|
|
6
7
|
import logging
|
|
7
8
|
import random
|
|
8
9
|
import re
|
|
9
10
|
import shlex
|
|
10
11
|
import subprocess
|
|
11
12
|
import time
|
|
13
|
+
from typing import Sequence
|
|
12
14
|
from babel.numbers import parse_decimal
|
|
13
15
|
|
|
14
16
|
from bs4 import BeautifulSoup
|
|
@@ -25,7 +27,9 @@ from tenacity import (
|
|
|
25
27
|
wait_exponential,
|
|
26
28
|
)
|
|
27
29
|
|
|
28
|
-
from
|
|
30
|
+
from datamarket.exceptions.main import ManagedHTTPError
|
|
31
|
+
|
|
32
|
+
from ..exceptions import RedirectionDetectedError, NotFoundError, BadRequestError
|
|
29
33
|
from ..interfaces.proxy import ProxyInterface
|
|
30
34
|
|
|
31
35
|
########################################################################################################################
|
|
@@ -131,7 +135,9 @@ def parse_field(dict_struct, field_path, format_method=None):
|
|
|
131
135
|
|
|
132
136
|
|
|
133
137
|
@retry(
|
|
134
|
-
retry=retry_if_not_exception_type(
|
|
138
|
+
retry=retry_if_not_exception_type(
|
|
139
|
+
(NotFoundError, BadRequestError, RedirectionDetectedError, ProxyError, ManagedHTTPError)
|
|
140
|
+
),
|
|
135
141
|
wait=wait_exponential(exp_base=3, multiplier=3, max=60),
|
|
136
142
|
stop=stop_after_attempt(5),
|
|
137
143
|
before_sleep=before_sleep_log(logger, logging.WARNING),
|
|
@@ -144,9 +150,38 @@ def get_data(
|
|
|
144
150
|
sleep: tuple = (6, 3),
|
|
145
151
|
proxy_interface: ProxyInterface = None,
|
|
146
152
|
use_auth_proxies: bool = False,
|
|
147
|
-
max_proxy_delay:
|
|
153
|
+
max_proxy_delay: timedelta = timedelta(minutes=10),
|
|
154
|
+
ignored_status_codes: Sequence[int] = (),
|
|
148
155
|
**kwargs,
|
|
149
156
|
):
|
|
157
|
+
"""
|
|
158
|
+
Fetches data from a given URL using HTTP requests, with support for proxy configuration, retries, and flexible output formats.
|
|
159
|
+
|
|
160
|
+
Args:
|
|
161
|
+
url (str): The target URL to fetch data from.
|
|
162
|
+
method (str, optional): HTTP method to use (e.g., 'GET', 'POST'). Defaults to 'GET'.
|
|
163
|
+
output (str, optional): Output format ('json', 'text', 'soup', 'response'). Defaults to 'json'.
|
|
164
|
+
sleep (tuple, optional): Tuple specifying max and min sleep times (seconds) after request. Defaults to (6, 3).
|
|
165
|
+
proxy_interface (ProxyInterface, optional): Proxy provider. If None, no proxy is used. Defaults to None.
|
|
166
|
+
use_auth_proxies (bool, optional): Whether to use authenticated proxies. Defaults to False.
|
|
167
|
+
max_proxy_delay (timedelta, optional): Maximum delay for proxy retry logic. Defaults to 10 minutes.
|
|
168
|
+
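ignored_status_codes (Sequence[int], optional): Status codes that are not retried; a response with one of these codes raises ManagedHTTPError (which carries the response) instead of being returned. Defaults to ().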
|
|
169
|
+
**kwargs: Additional arguments passed to the requests method.
|
|
170
|
+
|
|
171
|
+
Returns:
|
|
172
|
+
Depends on the 'output' argument:
|
|
173
|
+
- 'json': Parsed JSON response.
|
|
174
|
+
- 'text': Response text.
|
|
175
|
+
- 'soup': BeautifulSoup-parsed HTML.
|
|
176
|
+
- 'response': Raw requests.Response object.
|
|
177
|
+
|
|
178
|
+
Raises:
|
|
179
|
+
ManagedHTTPError: If a response status code is in `ignored_status_codes`.
|
|
180
|
+
NotFoundError: If a 404 status code is returned.
|
|
181
|
+
BadRequestError: If a 400 status code is returned.
|
|
182
|
+
RedirectionDetectedError, ProxyError: On specific error conditions.
|
|
183
|
+
requests.HTTPError: For other HTTP errors if not ignored.
|
|
184
|
+
"""
|
|
150
185
|
retry_type = retry_if_exception_type(ProxyError)
|
|
151
186
|
wait = wait_exponential(exp_base=3, multiplier=3, max=60)
|
|
152
187
|
stop = stop_after_delay(max_proxy_delay)
|
|
@@ -173,9 +208,15 @@ def get_data(
|
|
|
173
208
|
|
|
174
209
|
ban_sleep(*sleep)
|
|
175
210
|
|
|
211
|
+
if r.status_code in ignored_status_codes:
|
|
212
|
+
raise ManagedHTTPError(r, url=url, message=f"Status {r.status_code} in ignored_status_codes for URL {url}")
|
|
176
213
|
if r.status_code == 404:
|
|
177
214
|
raise NotFoundError(f"404 Not Found error for {url}")
|
|
215
|
+
if r.status_code == 400:
|
|
216
|
+
raise BadRequestError(f"400 Bad Request error for {url}")
|
|
217
|
+
|
|
178
218
|
r.raise_for_status()
|
|
219
|
+
|
|
179
220
|
r.encoding = "utf-8"
|
|
180
221
|
|
|
181
222
|
if output == "json":
|
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
########################################################################################################################
|
|
2
|
-
# CLASSES
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
class RedirectionDetectedError(Exception):
|
|
6
|
-
def __init__(self, message="Redirection detected!"):
|
|
7
|
-
self.message = message
|
|
8
|
-
super().__init__(self.message)
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
class NotFoundError(Exception):
|
|
12
|
-
def __init__(self, message="Not found!"):
|
|
13
|
-
self.message = message
|
|
14
|
-
super().__init__(self.message)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|