datamarket 0.9.39__tar.gz → 0.9.41__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datamarket might be problematic. Click here for more details.
- {datamarket-0.9.39 → datamarket-0.9.41}/PKG-INFO +3 -1
- {datamarket-0.9.39 → datamarket-0.9.41}/pyproject.toml +3 -1
- datamarket-0.9.41/src/datamarket/exceptions/__init__.py +1 -0
- datamarket-0.9.41/src/datamarket/exceptions/main.py +14 -0
- {datamarket-0.9.39 → datamarket-0.9.41}/src/datamarket/interfaces/nominatim.py +102 -50
- datamarket-0.9.41/src/datamarket/utils/__init__.py +1 -0
- {datamarket-0.9.39 → datamarket-0.9.41}/src/datamarket/utils/main.py +79 -5
- datamarket-0.9.39/src/datamarket/utils/__init__.py +0 -1
- {datamarket-0.9.39 → datamarket-0.9.41}/LICENSE +0 -0
- {datamarket-0.9.39 → datamarket-0.9.41}/README.md +0 -0
- {datamarket-0.9.39 → datamarket-0.9.41}/src/datamarket/__init__.py +0 -0
- {datamarket-0.9.39 → datamarket-0.9.41}/src/datamarket/interfaces/__init__.py +0 -0
- {datamarket-0.9.39 → datamarket-0.9.41}/src/datamarket/interfaces/alchemy.py +0 -0
- {datamarket-0.9.39 → datamarket-0.9.41}/src/datamarket/interfaces/aws.py +0 -0
- {datamarket-0.9.39 → datamarket-0.9.41}/src/datamarket/interfaces/drive.py +0 -0
- {datamarket-0.9.39 → datamarket-0.9.41}/src/datamarket/interfaces/ftp.py +0 -0
- {datamarket-0.9.39 → datamarket-0.9.41}/src/datamarket/interfaces/peerdb.py +0 -0
- {datamarket-0.9.39 → datamarket-0.9.41}/src/datamarket/interfaces/proxy.py +0 -0
- {datamarket-0.9.39 → datamarket-0.9.41}/src/datamarket/interfaces/tinybird.py +0 -0
- {datamarket-0.9.39 → datamarket-0.9.41}/src/datamarket/params/__init__.py +0 -0
- {datamarket-0.9.39 → datamarket-0.9.41}/src/datamarket/params/nominatim.py +0 -0
- {datamarket-0.9.39 → datamarket-0.9.41}/src/datamarket/utils/airflow.py +0 -0
- {datamarket-0.9.39 → datamarket-0.9.41}/src/datamarket/utils/alchemy.py +0 -0
- {datamarket-0.9.39 → datamarket-0.9.41}/src/datamarket/utils/playwright/__init__.py +0 -0
- {datamarket-0.9.39 → datamarket-0.9.41}/src/datamarket/utils/playwright/async_api.py +0 -0
- {datamarket-0.9.39 → datamarket-0.9.41}/src/datamarket/utils/playwright/sync_api.py +0 -0
- {datamarket-0.9.39 → datamarket-0.9.41}/src/datamarket/utils/selenium.py +0 -0
- {datamarket-0.9.39 → datamarket-0.9.41}/src/datamarket/utils/soda.py +0 -0
- {datamarket-0.9.39 → datamarket-0.9.41}/src/datamarket/utils/strings/__init__.py +0 -0
- {datamarket-0.9.39 → datamarket-0.9.41}/src/datamarket/utils/strings/normalization.py +0 -0
- {datamarket-0.9.39 → datamarket-0.9.41}/src/datamarket/utils/strings/obfuscation.py +0 -0
- {datamarket-0.9.39 → datamarket-0.9.41}/src/datamarket/utils/typer.py +0 -0
- {datamarket-0.9.39 → datamarket-0.9.41}/src/datamarket/utils/types.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: datamarket
|
|
3
|
-
Version: 0.9.39
|
|
3
|
+
Version: 0.9.41
|
|
4
4
|
Summary: Utilities that integrate advanced scraping knowledge into just one library.
|
|
5
5
|
License: GPL-3.0-or-later
|
|
6
6
|
Author: DataMarket
|
|
@@ -48,6 +48,7 @@ Provides-Extra: proxy
|
|
|
48
48
|
Provides-Extra: pyarrow
|
|
49
49
|
Provides-Extra: pydrive2
|
|
50
50
|
Provides-Extra: pymupdf
|
|
51
|
+
Provides-Extra: pyrate-limiter
|
|
51
52
|
Provides-Extra: pysocks
|
|
52
53
|
Provides-Extra: pyspark
|
|
53
54
|
Provides-Extra: pytest
|
|
@@ -105,6 +106,7 @@ Requires-Dist: pyarrow (>=19.0.0,<20.0.0) ; extra == "pyarrow"
|
|
|
105
106
|
Requires-Dist: pycountry (>=24.0.0,<25.0.0)
|
|
106
107
|
Requires-Dist: pydrive2 (>=1.0.0,<2.0.0) ; extra == "pydrive2" or extra == "drive"
|
|
107
108
|
Requires-Dist: pymupdf (>=1.0.0,<2.0.0) ; extra == "pymupdf"
|
|
109
|
+
Requires-Dist: pyrate-limiter (>=3.0.0,<4.0.0) ; extra == "pyrate-limiter"
|
|
108
110
|
Requires-Dist: pysocks (>=1.0.0,<2.0.0) ; extra == "pysocks"
|
|
109
111
|
Requires-Dist: pyspark (>=3.0.0,<4.0.0) ; extra == "pyspark"
|
|
110
112
|
Requires-Dist: pytest (>=8.0.0,<9.0.0) ; extra == "pytest"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "datamarket"
|
|
3
|
-
version = "0.9.39"
|
|
3
|
+
version = "0.9.41"
|
|
4
4
|
description = "Utilities that integrate advanced scraping knowledge into just one library."
|
|
5
5
|
authors = ["DataMarket <techsupport@datamarket.es>"]
|
|
6
6
|
license = "GPL-3.0-or-later"
|
|
@@ -83,6 +83,7 @@ presidio-anonymizer = { version = "^2.0.0", optional = true }
|
|
|
83
83
|
spacy = { version = "^3.0.0", optional = true }
|
|
84
84
|
spacy-langdetect = { version = "~0.1.0", optional = true }
|
|
85
85
|
pandarallel = { version = "^1.0.0", optional = true }
|
|
86
|
+
pyrate-limiter = { version = "^3.0.0", optional = true }
|
|
86
87
|
|
|
87
88
|
[tool.poetry.extras]
|
|
88
89
|
boto3 = ["boto3"]
|
|
@@ -128,6 +129,7 @@ openpyxl = ["openpyxl"]
|
|
|
128
129
|
httpx = ["httpx"]
|
|
129
130
|
camoufox = ["camoufox", "browserforge", "playwright"]
|
|
130
131
|
pandarallel = ["pandarallel"]
|
|
132
|
+
pyrate-limiter = ["pyrate-limiter"]
|
|
131
133
|
|
|
132
134
|
# Interface groups
|
|
133
135
|
aws = ["boto3"]
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .main import * # noqa: F403
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
########################################################################################################################
|
|
2
|
+
# CLASSES
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class RedirectionDetectedError(Exception):
|
|
6
|
+
def __init__(self, message="Redirection detected!"):
|
|
7
|
+
self.message = message
|
|
8
|
+
super().__init__(self.message)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class NotFoundError(Exception):
|
|
12
|
+
def __init__(self, message="Not found!"):
|
|
13
|
+
self.message = message
|
|
14
|
+
super().__init__(self.message)
|
|
@@ -17,6 +17,7 @@ from ..utils.strings import normalize
|
|
|
17
17
|
# PARAMETERS
|
|
18
18
|
|
|
19
19
|
JARO_WINKLER_THRESHOLD = 0.85
|
|
20
|
+
CLOSE_KM = 2.0
|
|
20
21
|
|
|
21
22
|
########################################################################################################################
|
|
22
23
|
# CLASSES
|
|
@@ -145,55 +146,86 @@ class Nominatim:
|
|
|
145
146
|
parsed_nominatim_result: Dict[str, Optional[str]],
|
|
146
147
|
parsed_geonames_result: Dict[str, Optional[str]],
|
|
147
148
|
nominatim_address_province_raw: Optional[str],
|
|
148
|
-
|
|
149
|
+
dist_nominatim: float, # distance Nominatim ↔ input (km)
|
|
150
|
+
dist_geonames: float, # distance GeoNames ↔ input (km)
|
|
151
|
+
) -> Tuple[Optional[str], Optional[str], Optional[str]]:
|
|
149
152
|
"""
|
|
150
|
-
|
|
151
|
-
|
|
153
|
+
Decide the authoritative postcode, the province derived from it and the associated state.
|
|
154
|
+
|
|
155
|
+
Strategy:
|
|
156
|
+
1. Derive province from each postcode.
|
|
157
|
+
2. Validate each postcode–province pair:
|
|
158
|
+
• Nominatim: compare with raw province string (if present).
|
|
159
|
+
• GeoNames: multi-step validation (raw province, then Nominatim-derived
|
|
160
|
+
province when Nominatim coords are close, then distance fallback).
|
|
161
|
+
3. Return the postcode/province that passes validation with precedence:
|
|
162
|
+
Nominatim > GeoNames. Returns (None, None, None) if neither passes.
|
|
152
163
|
"""
|
|
164
|
+
|
|
165
|
+
# --- Extract postcodes ---
|
|
153
166
|
nominatim_postcode = parsed_nominatim_result.get("postcode")
|
|
154
167
|
geonames_postcode = parsed_geonames_result.get("postcode")
|
|
155
168
|
|
|
156
|
-
|
|
157
|
-
|
|
169
|
+
# --- Province derived from each postcode ---
|
|
170
|
+
province_from_nominatim_pc = self.geonames.get_province_from_postcode(nominatim_postcode)
|
|
171
|
+
province_from_geonames_pc = self.geonames.get_province_from_postcode(geonames_postcode)
|
|
158
172
|
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
)
|
|
162
|
-
|
|
163
|
-
normalize(province_from_nominatim_postcode) if province_from_nominatim_postcode else ""
|
|
164
|
-
)
|
|
165
|
-
norm_province_from_geonames_postcode = (
|
|
166
|
-
normalize(province_from_geonames_postcode) if province_from_geonames_postcode else ""
|
|
167
|
-
)
|
|
173
|
+
# --- Normalised strings for similarity comparisons ---
|
|
174
|
+
norm_raw_province = normalize(nominatim_address_province_raw) if nominatim_address_province_raw else ""
|
|
175
|
+
norm_province_from_nominatim_pc = normalize(province_from_nominatim_pc) if province_from_nominatim_pc else ""
|
|
176
|
+
norm_province_from_geonames_pc = normalize(province_from_geonames_pc) if province_from_geonames_pc else ""
|
|
168
177
|
|
|
169
|
-
|
|
170
|
-
|
|
178
|
+
# --- Distance heuristics ---
|
|
179
|
+
nominatim_is_close = dist_nominatim < CLOSE_KM
|
|
180
|
+
geonames_is_close = dist_geonames < CLOSE_KM
|
|
171
181
|
|
|
172
|
-
#
|
|
173
|
-
|
|
174
|
-
if
|
|
175
|
-
|
|
176
|
-
jaro_winkler_similarity(
|
|
177
|
-
> JARO_WINKLER_THRESHOLD
|
|
182
|
+
# --- Validate Nominatim postcode ---
|
|
183
|
+
nominatim_pc_valid = False
|
|
184
|
+
if norm_province_from_nominatim_pc and norm_raw_province:
|
|
185
|
+
nominatim_pc_valid = (
|
|
186
|
+
jaro_winkler_similarity(norm_province_from_nominatim_pc, norm_raw_province) > JARO_WINKLER_THRESHOLD
|
|
178
187
|
)
|
|
179
188
|
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
geonames_postcode_province_matches = (
|
|
183
|
-
jaro_winkler_similarity(norm_province_from_geonames_postcode, norm_raw_nominatim_province)
|
|
184
|
-
> JARO_WINKLER_THRESHOLD
|
|
185
|
-
)
|
|
189
|
+
# --- Validate GeoNames postcode ---
|
|
190
|
+
geonames_pc_valid = False
|
|
186
191
|
|
|
187
|
-
#
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
if geonames_postcode_province_matches and not nominatim_postcode_province_matches:
|
|
193
|
-
selected_postcode = geonames_postcode
|
|
194
|
-
selected_province_from_postcode = province_from_geonames_postcode
|
|
192
|
+
# 1) Compare with raw province string (if exists)
|
|
193
|
+
if norm_province_from_geonames_pc and norm_raw_province:
|
|
194
|
+
geonames_pc_valid = (
|
|
195
|
+
jaro_winkler_similarity(norm_province_from_geonames_pc, norm_raw_province) > JARO_WINKLER_THRESHOLD
|
|
196
|
+
)
|
|
195
197
|
|
|
196
|
-
|
|
198
|
+
# 2) If no raw province, compare with province from Nominatim PC **only when** Nominatim is close
|
|
199
|
+
if not geonames_pc_valid and not norm_raw_province and nominatim_is_close:
|
|
200
|
+
if norm_province_from_geonames_pc and norm_province_from_nominatim_pc:
|
|
201
|
+
geonames_pc_valid = (
|
|
202
|
+
jaro_winkler_similarity(norm_province_from_geonames_pc, norm_province_from_nominatim_pc)
|
|
203
|
+
> JARO_WINKLER_THRESHOLD
|
|
204
|
+
)
|
|
205
|
+
|
|
206
|
+
# 3) Fallback: accept GeoNames PC if its coordinates are very close
|
|
207
|
+
if not geonames_pc_valid and geonames_is_close and geonames_postcode:
|
|
208
|
+
geonames_pc_valid = True
|
|
209
|
+
|
|
210
|
+
# --- Select authoritative tuple ---
|
|
211
|
+
postcode = None
|
|
212
|
+
province = None
|
|
213
|
+
state = None
|
|
214
|
+
|
|
215
|
+
if nominatim_pc_valid:
|
|
216
|
+
postcode = nominatim_postcode
|
|
217
|
+
province = province_from_nominatim_pc
|
|
218
|
+
state = parsed_nominatim_result.get("state")
|
|
219
|
+
if not state and geonames_pc_valid:
|
|
220
|
+
state = parsed_geonames_result.get("state")
|
|
221
|
+
elif geonames_pc_valid:
|
|
222
|
+
postcode = geonames_postcode
|
|
223
|
+
province = province_from_geonames_pc
|
|
224
|
+
state = parsed_geonames_result.get("state")
|
|
225
|
+
if not state and nominatim_pc_valid:
|
|
226
|
+
state = parsed_nominatim_result.get("state")
|
|
227
|
+
|
|
228
|
+
return postcode, province, state
|
|
197
229
|
|
|
198
230
|
def _select_final_result(
|
|
199
231
|
self,
|
|
@@ -203,21 +235,41 @@ class Nominatim:
|
|
|
203
235
|
dist_geonames: float,
|
|
204
236
|
authoritative_postcode: Optional[str],
|
|
205
237
|
authoritative_province_from_postcode: Optional[str],
|
|
206
|
-
|
|
238
|
+
authoritative_state: Optional[str],
|
|
207
239
|
) -> Dict[str, Optional[str]]:
|
|
208
240
|
"""
|
|
209
|
-
|
|
241
|
+
Choose the address block (Nominatim vs GeoNames) based on distance,
|
|
242
|
+
then apply the authoritative postcode/province.
|
|
243
|
+
|
|
244
|
+
Rules:
|
|
245
|
+
• Pick the source with the smaller finite distance.
|
|
246
|
+
• Always overwrite 'postcode' if authoritative_postcode is present.
|
|
247
|
+
• Overwrite 'province' only when authoritative_province_from_postcode is not None.
|
|
248
|
+
• If both distances are ∞, return an empty address.
|
|
210
249
|
"""
|
|
250
|
+
|
|
251
|
+
# ------------------------------------------------------------------ #
|
|
252
|
+
# 1. Decide the base address block #
|
|
253
|
+
# ------------------------------------------------------------------ #
|
|
211
254
|
if dist_nominatim <= dist_geonames and dist_nominatim != float("inf"):
|
|
212
255
|
final_result = parsed_nominatim_result
|
|
213
|
-
final_result["postcode"] = authoritative_postcode
|
|
214
|
-
final_result["province"] = nominatim_province
|
|
215
256
|
elif dist_geonames < dist_nominatim and dist_geonames != float("inf"):
|
|
216
257
|
final_result = parsed_geonames_result
|
|
258
|
+
else:
|
|
259
|
+
return self._get_empty_address_result()
|
|
260
|
+
|
|
261
|
+
# ------------------------------------------------------------------ #
|
|
262
|
+
# 2. Apply authoritative postcode / province #
|
|
263
|
+
# ------------------------------------------------------------------ #
|
|
264
|
+
if authoritative_postcode:
|
|
217
265
|
final_result["postcode"] = authoritative_postcode
|
|
266
|
+
|
|
267
|
+
if authoritative_province_from_postcode:
|
|
218
268
|
final_result["province"] = authoritative_province_from_postcode
|
|
219
|
-
|
|
220
|
-
|
|
269
|
+
|
|
270
|
+
if authoritative_province_from_postcode:
|
|
271
|
+
final_result["state"] = authoritative_state
|
|
272
|
+
|
|
221
273
|
return final_result
|
|
222
274
|
|
|
223
275
|
def _get_district_quarter(self, raw_json: Dict[str, Any]) -> Tuple[Optional[str], Optional[str]]:
|
|
@@ -250,12 +302,6 @@ class Nominatim:
|
|
|
250
302
|
parsed_nominatim_result = self._parse_nominatim_result(nominatim_response)
|
|
251
303
|
parsed_geonames_result = self._parse_geonames_result(geonames_response)
|
|
252
304
|
|
|
253
|
-
# Determine authoritative postcode
|
|
254
|
-
nominatim_province = parsed_nominatim_result.get("province")
|
|
255
|
-
selected_postcode, selected_province_from_postcode = self._select_postcode_and_derived_province(
|
|
256
|
-
parsed_nominatim_result, parsed_geonames_result, nominatim_province
|
|
257
|
-
)
|
|
258
|
-
|
|
259
305
|
# Calculate distances
|
|
260
306
|
nominatim_response_lat = nominatim_response.get("lat")
|
|
261
307
|
nominatim_response_lon = nominatim_response.get("lon")
|
|
@@ -272,6 +318,12 @@ class Nominatim:
|
|
|
272
318
|
dist_nominatim = self._calculate_distance(nominatim_response_lat, nominatim_response_lon, input_coords)
|
|
273
319
|
dist_geonames = self._calculate_distance(geonames_response_lat, geonames_response_lon, input_coords)
|
|
274
320
|
|
|
321
|
+
# Determine authoritative postcode
|
|
322
|
+
nominatim_province = parsed_nominatim_result.get("province")
|
|
323
|
+
selected_postcode, selected_province_from_postcode, selected_state = self._select_postcode_and_derived_province(
|
|
324
|
+
parsed_nominatim_result, parsed_geonames_result, nominatim_province, dist_nominatim, dist_geonames
|
|
325
|
+
)
|
|
326
|
+
|
|
275
327
|
# Select final result
|
|
276
328
|
final_result = self._select_final_result(
|
|
277
329
|
parsed_nominatim_result,
|
|
@@ -280,7 +332,7 @@ class Nominatim:
|
|
|
280
332
|
dist_geonames,
|
|
281
333
|
selected_postcode,
|
|
282
334
|
selected_province_from_postcode,
|
|
283
|
-
|
|
335
|
+
selected_state,
|
|
284
336
|
)
|
|
285
337
|
|
|
286
338
|
return final_result
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .main import * # noqa: F403
|
|
@@ -13,8 +13,26 @@ from pathlib import Path
|
|
|
13
13
|
from typing import Any, Literal, Self, Union
|
|
14
14
|
|
|
15
15
|
import pendulum
|
|
16
|
+
import requests
|
|
17
|
+
from bs4 import BeautifulSoup
|
|
16
18
|
from croniter import croniter
|
|
17
19
|
from dynaconf import Dynaconf, add_converter
|
|
20
|
+
from requests.exceptions import ProxyError
|
|
21
|
+
from tenacity import (
|
|
22
|
+
before_sleep_log,
|
|
23
|
+
retry,
|
|
24
|
+
retry_if_exception_type,
|
|
25
|
+
retry_if_not_exception_type,
|
|
26
|
+
stop_after_attempt,
|
|
27
|
+
stop_after_delay,
|
|
28
|
+
wait_exponential,
|
|
29
|
+
)
|
|
30
|
+
|
|
31
|
+
from ..exceptions import NotFoundError, RedirectionDetectedError
|
|
32
|
+
from ..interfaces.proxy import ProxyInterface
|
|
33
|
+
|
|
34
|
+
########################################################################################################################
|
|
35
|
+
# FUNCTIONS
|
|
18
36
|
|
|
19
37
|
logger = logging.getLogger(__name__)
|
|
20
38
|
|
|
@@ -26,9 +44,7 @@ class NoProjectFoundError(Exception):
|
|
|
26
44
|
|
|
27
45
|
class NoPackageFoundError(Exception):
|
|
28
46
|
def __init__(self):
|
|
29
|
-
super().__init__(
|
|
30
|
-
"A project was detected but it has no packages inside the 'src' directory"
|
|
31
|
-
)
|
|
47
|
+
super().__init__("A project was detected but it has no packages inside the 'src' directory")
|
|
32
48
|
|
|
33
49
|
|
|
34
50
|
########################################################################################################################
|
|
@@ -84,8 +100,8 @@ class Project:
|
|
|
84
100
|
|
|
85
101
|
try:
|
|
86
102
|
self.pkg_name = next((self.path / "src").glob("*")).name
|
|
87
|
-
except StopIteration:
|
|
88
|
-
raise NoPackageFoundError()
|
|
103
|
+
except StopIteration as e:
|
|
104
|
+
raise NoPackageFoundError() from e
|
|
89
105
|
|
|
90
106
|
self.env_name = f"{self.pkg_name}_env"
|
|
91
107
|
self.config_path = self.path / self.CONFIG_FILE_NAME
|
|
@@ -227,3 +243,61 @@ def parse_field(dict_struct, field_path, format_method=None):
|
|
|
227
243
|
if field_value is None:
|
|
228
244
|
return None
|
|
229
245
|
return format_method(field_value) if format_method else field_value
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
@retry(
|
|
249
|
+
retry=retry_if_not_exception_type((NotFoundError, RedirectionDetectedError, ProxyError)),
|
|
250
|
+
wait=wait_exponential(exp_base=3, multiplier=3, max=60),
|
|
251
|
+
stop=stop_after_attempt(5),
|
|
252
|
+
before_sleep=before_sleep_log(logger, logging.WARNING),
|
|
253
|
+
reraise=True,
|
|
254
|
+
)
|
|
255
|
+
def get_data(
|
|
256
|
+
url: str,
|
|
257
|
+
method: str = "GET",
|
|
258
|
+
output: str = "json",
|
|
259
|
+
sleep: tuple = (6, 3),
|
|
260
|
+
proxy_interface: ProxyInterface = None,
|
|
261
|
+
use_auth_proxies: bool = False,
|
|
262
|
+
max_proxy_delay: int = 1800,
|
|
263
|
+
**kwargs,
|
|
264
|
+
):
|
|
265
|
+
retry_type = retry_if_exception_type(ProxyError)
|
|
266
|
+
wait = wait_exponential(exp_base=3, multiplier=3, max=60)
|
|
267
|
+
stop = stop_after_delay(max_proxy_delay)
|
|
268
|
+
before_sleep = before_sleep_log(logger, logging.WARNING)
|
|
269
|
+
|
|
270
|
+
@retry(retry=retry_type, wait=wait, stop=stop, before_sleep=before_sleep, reraise=True)
|
|
271
|
+
def _fetch_with_proxy_retry(url, method, proxy_interface, use_auth, **params):
|
|
272
|
+
logger.info(f"Fetching data from {url} ...")
|
|
273
|
+
proxy_cfg = None
|
|
274
|
+
if proxy_interface:
|
|
275
|
+
host, port, user, pwd = proxy_interface.get_proxies(raw=True, use_auth=use_auth)
|
|
276
|
+
if host and port:
|
|
277
|
+
proxy_url = f"http://{host}:{port}"
|
|
278
|
+
proxy_auth_url = f"http://{user}:{pwd}@{host}:{port}"
|
|
279
|
+
proxy_cfg = {"http": proxy_url, "https": proxy_url}
|
|
280
|
+
if user and pwd:
|
|
281
|
+
proxy_cfg = {"http": proxy_auth_url, "https": proxy_auth_url}
|
|
282
|
+
logger.info(f"Using proxy: {proxy_url}")
|
|
283
|
+
response = getattr(requests, method.lower())(url, proxies=proxy_cfg, **params)
|
|
284
|
+
return response
|
|
285
|
+
|
|
286
|
+
params = {"timeout": 30} | kwargs
|
|
287
|
+
r = _fetch_with_proxy_retry(url, method, proxy_interface, use_auth_proxies, **params)
|
|
288
|
+
|
|
289
|
+
ban_sleep(*sleep)
|
|
290
|
+
|
|
291
|
+
if r.status_code == 404:
|
|
292
|
+
raise NotFoundError(f"404 Not Found error for {url}")
|
|
293
|
+
r.raise_for_status()
|
|
294
|
+
r.encoding = "utf-8"
|
|
295
|
+
|
|
296
|
+
if output == "json":
|
|
297
|
+
return r.json()
|
|
298
|
+
elif output == "text":
|
|
299
|
+
return r.text
|
|
300
|
+
elif output == "soup":
|
|
301
|
+
return BeautifulSoup(r.content, "html.parser")
|
|
302
|
+
elif output == "response":
|
|
303
|
+
return r
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
from .main import *
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|