datamarket 0.7.100__py3-none-any.whl → 0.7.102__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datamarket might be problematic. Click here for more details.

@@ -2,6 +2,9 @@
2
2
  # CLASSES
3
3
 
4
4
 
5
+ import requests
6
+
7
+
5
8
  class RedirectionDetectedError(Exception):
6
9
  def __init__(self, message="Redirection detected!"):
7
10
  self.message = message
@@ -12,3 +15,19 @@ class NotFoundError(Exception):
12
15
  def __init__(self, message="Not found!"):
13
16
  self.message = message
14
17
  super().__init__(self.message)
18
+
19
+
20
+ class BadRequestError(Exception):
21
+ def __init__(self, message="Bad request!"):
22
+ self.message = message
23
+ super().__init__(self.message)
24
+
25
+ class ManagedHTTPError(Exception):
26
+ """Signal that this HTTP status was handled and should not be retried."""
27
+ def __init__(self, response: requests.Response, *, url: str | None = None, message: str | None = None):
28
+ self.response = response
29
+ self.request = getattr(response, "request", None)
30
+ self.status_code = getattr(response, "status_code", None)
31
+ self.url = url or (self.request.url if self.request is not None else None)
32
+ self.message = message
33
+ super().__init__(message or f"HTTP {self.status_code} for {self.url}")
datamarket/utils/main.py CHANGED
@@ -3,12 +3,14 @@
3
3
 
4
4
  import asyncio
5
5
  import configparser
6
+ from datetime import timedelta
6
7
  import logging
7
8
  import random
8
9
  import re
9
10
  import shlex
10
11
  import subprocess
11
12
  import time
13
+ from typing import Sequence
12
14
  from babel.numbers import parse_decimal
13
15
 
14
16
  from bs4 import BeautifulSoup
@@ -25,7 +27,9 @@ from tenacity import (
25
27
  wait_exponential,
26
28
  )
27
29
 
28
- from ..exceptions import RedirectionDetectedError, NotFoundError
30
+ from datamarket.exceptions.main import ManagedHTTPError
31
+
32
+ from ..exceptions import RedirectionDetectedError, NotFoundError, BadRequestError
29
33
  from ..interfaces.proxy import ProxyInterface
30
34
 
31
35
  ########################################################################################################################
@@ -131,7 +135,9 @@ def parse_field(dict_struct, field_path, format_method=None):
131
135
 
132
136
 
133
137
  @retry(
134
- retry=retry_if_not_exception_type((NotFoundError, RedirectionDetectedError, ProxyError)),
138
+ retry=retry_if_not_exception_type(
139
+ (NotFoundError, BadRequestError, RedirectionDetectedError, ProxyError, ManagedHTTPError)
140
+ ),
135
141
  wait=wait_exponential(exp_base=3, multiplier=3, max=60),
136
142
  stop=stop_after_attempt(5),
137
143
  before_sleep=before_sleep_log(logger, logging.WARNING),
@@ -144,9 +150,38 @@ def get_data(
144
150
  sleep: tuple = (6, 3),
145
151
  proxy_interface: ProxyInterface = None,
146
152
  use_auth_proxies: bool = False,
147
- max_proxy_delay: int = 1800,
153
+ max_proxy_delay: timedelta = timedelta(minutes=10),
154
+ ignored_status_codes: Sequence[int] = (),
148
155
  **kwargs,
149
156
  ):
157
+ """
158
+ Fetches data from a given URL using HTTP requests, with support for proxy configuration, retries, and flexible output formats.
159
+
160
+ Args:
161
+ url (str): The target URL to fetch data from.
162
+ method (str, optional): HTTP method to use (e.g., 'GET', 'POST'). Defaults to 'GET'.
163
+ output (str, optional): Output format ('json', 'text', 'soup', 'response'). Defaults to 'json'.
164
+ sleep (tuple, optional): Tuple specifying max and min sleep times (seconds) after request. Defaults to (6, 3).
165
+ proxy_interface (ProxyInterface, optional): Proxy provider. If None, no proxy is used. Defaults to None.
166
+ use_auth_proxies (bool, optional): Whether to use authenticated proxies. Defaults to False.
167
+ max_proxy_delay (timedelta, optional): Maximum delay for proxy retry logic. Defaults to 10 minutes.
168
+ ignored_status_codes (Sequence[int], optional): Status codes that are not retried; a response with one of these codes raises ManagedHTTPError, which carries the response. Defaults to ().
169
+ **kwargs: Additional arguments passed to the requests method.
170
+
171
+ Returns:
172
+ Depends on the 'output' argument:
173
+ - 'json': Parsed JSON response.
174
+ - 'text': Response text.
175
+ - 'soup': BeautifulSoup-parsed HTML.
176
+ - 'response': Raw requests.Response object.
177
+
178
+ Raises:
179
+ ManagedHTTPError: If a response status code is in `ignored_status_codes`.
180
+ NotFoundError: If a 404 status code is returned.
181
+ BadRequestError: If a 400 status code is returned.
182
+ RedirectionDetectedError, ProxyError: On specific error conditions.
183
+ requests.HTTPError: For other HTTP errors if not ignored.
184
+ """
150
185
  retry_type = retry_if_exception_type(ProxyError)
151
186
  wait = wait_exponential(exp_base=3, multiplier=3, max=60)
152
187
  stop = stop_after_delay(max_proxy_delay)
@@ -173,9 +208,15 @@ def get_data(
173
208
 
174
209
  ban_sleep(*sleep)
175
210
 
211
+ if r.status_code in ignored_status_codes:
212
+ raise ManagedHTTPError(r, url=url, message=f"Status {r.status_code} in ignored_status_codes for URL {url}")
176
213
  if r.status_code == 404:
177
214
  raise NotFoundError(f"404 Not Found error for {url}")
215
+ if r.status_code == 400:
216
+ raise BadRequestError(f"400 Bad Request error for {url}")
217
+
178
218
  r.raise_for_status()
219
+
179
220
  r.encoding = "utf-8"
180
221
 
181
222
  if output == "json":
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: datamarket
3
- Version: 0.7.100
3
+ Version: 0.7.102
4
4
  Summary: Utilities that integrate advanced scraping knowledge into just one library.
5
5
  License: GPL-3.0-or-later
6
6
  Author: DataMarket
@@ -1,6 +1,6 @@
1
1
  datamarket/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  datamarket/exceptions/__init__.py,sha256=-Vu-RZNKjW6fYCLqbUJTkKNuHeA8Yi_gyR50oZNaA_8,33
3
- datamarket/exceptions/main.py,sha256=MP5ql6M7DoMbBf-Dg_2ohcUFdWXgzv-dXHntPPit31s,453
3
+ datamarket/exceptions/main.py,sha256=SuP-ZKZIxJYdnOpNb63Y7BpYGRhLl-4JIyTEqgUoWV4,1205
4
4
  datamarket/interfaces/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
5
  datamarket/interfaces/alchemy.py,sha256=i2lKLLLy3-jpbzV3-jxfRCXTy7jRoTsNU3063pmSonk,15749
6
6
  datamarket/interfaces/aws.py,sha256=co5JkC3iFIp-0FqdYX4eKy3_m71LhZKuJoW6kXwEImc,4780
@@ -16,7 +16,7 @@ datamarket/params/nominatim.py,sha256=RnmYXGoJQCijOsuCavCYcxw98WvOd_vOMK4KaraI0R
16
16
  datamarket/utils/__init__.py,sha256=FHLh-Qp9XpM4LkAocppCf_llW2CWVVghGorkqxqt1wk,34
17
17
  datamarket/utils/airflow.py,sha256=al0vc0YUikNu3Oy51VSn52I7pMU40akFBOl_UlHa2E4,795
18
18
  datamarket/utils/alchemy.py,sha256=SRq6kgh1aANXVShBPgAuglmNhZssPWwWEY503gKSia8,635
19
- datamarket/utils/main.py,sha256=KYHjDOps6_Q3TFV_Jj7MLj-L9Evx05AXELCvp06BARU,5857
19
+ datamarket/utils/main.py,sha256=OORsHggUqa2lKj5AG5LTPzEvXfAtx3ry4rSaAwkuS38,8001
20
20
  datamarket/utils/nominatim.py,sha256=IxexKY2KOlDhiKtzsqQfoVUjJXPxJl7tn3iHUaQKg08,5795
21
21
  datamarket/utils/playwright/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
22
22
  datamarket/utils/playwright/async_api.py,sha256=UbA2D4ScBtYeMfrRjly4RO-s8wXIub9c05J1eoOCpsQ,5782
@@ -29,7 +29,7 @@ datamarket/utils/strings/obfuscation.py,sha256=Jo-x3f2Cb75983smmpcdPqUlBrLCTyrnm
29
29
  datamarket/utils/strings/standardization.py,sha256=c8CAG6HI3AfK0hB3A3IGwsbnQebZ6R3PrA5PELHRXM0,1492
30
30
  datamarket/utils/typer.py,sha256=FDF3l6gh3UlAFPsHCtesnekvct2rKz0oFn3uKARBQvE,814
31
31
  datamarket/utils/types.py,sha256=vxdQZdwdXrfPR4Es52gBgol-tMRIOD6oK9cBo3rB0JQ,74
32
- datamarket-0.7.100.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
33
- datamarket-0.7.100.dist-info/METADATA,sha256=ZzGfCV51bIyPYJVdCSfJDdX8YuC9_BjKR1VCoRtd6yI,7382
34
- datamarket-0.7.100.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
35
- datamarket-0.7.100.dist-info/RECORD,,
32
+ datamarket-0.7.102.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
33
+ datamarket-0.7.102.dist-info/METADATA,sha256=FQcOMGhkANO_QeIbb9ISfm_MwcRCEo3TqCeVV5PONnI,7382
34
+ datamarket-0.7.102.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
35
+ datamarket-0.7.102.dist-info/RECORD,,