datamarket 0.7.89__py3-none-any.whl → 0.7.125__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamarket/exceptions/__init__.py +1 -1
- datamarket/exceptions/main.py +108 -4
- datamarket/interfaces/alchemy.py +30 -5
- datamarket/interfaces/aws.py +25 -3
- datamarket/interfaces/azure.py +18 -18
- datamarket/interfaces/ftp.py +3 -11
- datamarket/interfaces/nominatim.py +39 -28
- datamarket/interfaces/proxy.py +320 -74
- datamarket/interfaces/tinybird.py +4 -12
- datamarket/params/nominatim.py +434 -19
- datamarket/utils/airflow.py +10 -7
- datamarket/utils/alchemy.py +2 -1
- datamarket/utils/main.py +115 -67
- datamarket/utils/nominatim.py +201 -0
- datamarket/utils/playwright/async_api.py +150 -28
- datamarket/utils/playwright/sync_api.py +159 -27
- datamarket/utils/requests.py +653 -0
- datamarket/utils/selenium.py +6 -12
- datamarket/utils/strings/normalization.py +0 -1
- datamarket/utils/strings/standardization.py +40 -0
- datamarket/utils/typer.py +2 -1
- {datamarket-0.7.89.dist-info → datamarket-0.7.125.dist-info}/METADATA +11 -10
- datamarket-0.7.125.dist-info/RECORD +36 -0
- {datamarket-0.7.89.dist-info → datamarket-0.7.125.dist-info}/WHEEL +1 -1
- datamarket-0.7.89.dist-info/RECORD +0 -33
- {datamarket-0.7.89.dist-info/licenses → datamarket-0.7.125.dist-info}/LICENSE +0 -0
|
@@ -1,29 +1,38 @@
|
|
|
1
1
|
########################################################################################################################
|
|
2
2
|
# IMPORTS
|
|
3
3
|
|
|
4
|
+
import json
|
|
4
5
|
import logging
|
|
5
6
|
import time
|
|
6
7
|
from datetime import timedelta
|
|
7
8
|
from random import randint
|
|
8
9
|
from types import TracebackType
|
|
9
|
-
from typing import Optional, Self
|
|
10
|
+
from typing import Optional, Self, Sequence
|
|
10
11
|
|
|
12
|
+
from bs4 import BeautifulSoup
|
|
11
13
|
from camoufox import Camoufox
|
|
14
|
+
from playwright.sync_api import Browser, BrowserContext, Page, Response
|
|
12
15
|
from playwright.sync_api import (
|
|
13
|
-
Browser,
|
|
14
|
-
BrowserContext,
|
|
15
16
|
Error as PlaywrightError,
|
|
16
|
-
|
|
17
|
+
)
|
|
18
|
+
from playwright.sync_api import (
|
|
17
19
|
TimeoutError as PlaywrightTimeoutError,
|
|
18
20
|
)
|
|
21
|
+
from requests.exceptions import HTTPError, ProxyError
|
|
19
22
|
from tenacity import (
|
|
20
23
|
before_sleep_log,
|
|
21
24
|
retry,
|
|
22
25
|
retry_if_exception_type,
|
|
26
|
+
retry_if_not_exception_type,
|
|
27
|
+
stop_after_attempt,
|
|
23
28
|
stop_after_delay,
|
|
24
29
|
wait_exponential,
|
|
25
30
|
)
|
|
31
|
+
|
|
32
|
+
from datamarket.exceptions import BadRequestError, EmptyResponseError, NotFoundError, RedirectionDetectedError
|
|
33
|
+
from datamarket.exceptions.main import IgnoredHTTPError
|
|
26
34
|
from datamarket.interfaces.proxy import ProxyInterface
|
|
35
|
+
from datamarket.utils.main import ban_sleep
|
|
27
36
|
|
|
28
37
|
########################################################################################################################
|
|
29
38
|
# SETUP LOGGER
|
|
@@ -33,32 +42,38 @@ logger = logging.getLogger(__name__)
|
|
|
33
42
|
|
|
34
43
|
########################################################################################################################
|
|
35
44
|
# HELPER FUNCTIONS
|
|
45
|
+
|
|
46
|
+
|
|
36
47
|
def human_type(page: Page, text: str, delay: int = 100):
|
|
37
48
|
for char in text:
|
|
38
|
-
page.keyboard.type(char, delay=randint(int(delay * 0.5), int(delay * 1.5)))
|
|
49
|
+
page.keyboard.type(char, delay=randint(int(delay * 0.5), int(delay * 1.5))) # noqa: S311
|
|
39
50
|
|
|
40
51
|
|
|
41
52
|
def human_press_key(page: Page, key: str, count: int = 1, delay: int = 100, add_sleep: bool = True) -> None:
|
|
42
53
|
"""Presses a key with a random delay, optionally sleeping between presses."""
|
|
43
54
|
for _ in range(count):
|
|
44
|
-
page.keyboard.press(key, delay=randint(int(delay * 0.5), int(delay * 1.5)))
|
|
55
|
+
page.keyboard.press(key, delay=randint(int(delay * 0.5), int(delay * 1.5))) # noqa: S311
|
|
45
56
|
if add_sleep:
|
|
46
|
-
time.sleep(randint(int(delay * 1.5), int(delay * 2.5)) / 1000)
|
|
57
|
+
time.sleep(randint(int(delay * 1.5), int(delay * 2.5)) / 1000) # noqa: S311
|
|
47
58
|
|
|
48
59
|
|
|
49
60
|
########################################################################################################################
|
|
50
|
-
#
|
|
61
|
+
# CLASSES
|
|
51
62
|
|
|
52
63
|
|
|
53
64
|
class PlaywrightCrawler:
|
|
54
65
|
"""A robust, proxy-enabled Playwright crawler with captcha bypass and retry logic."""
|
|
55
66
|
|
|
56
|
-
|
|
67
|
+
_REDIRECT_STATUS_CODES = set(range(300, 309))
|
|
68
|
+
|
|
69
|
+
def __init__(self, proxy_interface: Optional[ProxyInterface] = None):
|
|
57
70
|
"""
|
|
58
|
-
Initializes the crawler
|
|
71
|
+
Initializes the crawler.
|
|
59
72
|
|
|
60
73
|
Args:
|
|
61
|
-
proxy_interface (ProxyInterface):
|
|
74
|
+
proxy_interface (Optional[ProxyInterface], optional): Provider used to fetch
|
|
75
|
+
proxy credentials. Defaults to None. When None, no proxy is configured and
|
|
76
|
+
the browser will run without a proxy.
|
|
62
77
|
"""
|
|
63
78
|
self.proxy_interface = proxy_interface
|
|
64
79
|
self.pw: Optional[Camoufox] = None
|
|
@@ -79,7 +94,34 @@ class PlaywrightCrawler:
|
|
|
79
94
|
) -> None:
|
|
80
95
|
"""Safely closes the browser context upon exit."""
|
|
81
96
|
if self.pw:
|
|
82
|
-
|
|
97
|
+
try:
|
|
98
|
+
self.pw.__exit__(exc_type, exc_val, exc_tb)
|
|
99
|
+
except Exception as e:
|
|
100
|
+
logger.warning(f"Error closing browser: {e}")
|
|
101
|
+
finally:
|
|
102
|
+
self.pw = None
|
|
103
|
+
self.browser = None
|
|
104
|
+
self.context = None
|
|
105
|
+
self.page = None
|
|
106
|
+
|
|
107
|
+
def _build_proxy_config(self) -> Optional[dict]:
|
|
108
|
+
"""Builds the proxy configuration dictionary.
|
|
109
|
+
|
|
110
|
+
Returns:
|
|
111
|
+
Optional[dict]: Proxy configuration if a proxy_interface is provided; otherwise None.
|
|
112
|
+
"""
|
|
113
|
+
if not self.proxy_interface:
|
|
114
|
+
logger.info("Starting browser without proxy.")
|
|
115
|
+
return None
|
|
116
|
+
|
|
117
|
+
host, port, user, pwd = self.proxy_interface.get_proxies(raw=True, use_auth=True)
|
|
118
|
+
proxy_url = f"http://{host}:{port}"
|
|
119
|
+
proxy_cfg: dict = {"server": proxy_url}
|
|
120
|
+
if user and pwd:
|
|
121
|
+
proxy_cfg.update({"username": user, "password": pwd})
|
|
122
|
+
|
|
123
|
+
logger.info(f"Starting browser with proxy: {proxy_url}")
|
|
124
|
+
return proxy_cfg
|
|
83
125
|
|
|
84
126
|
@retry(
|
|
85
127
|
wait=wait_exponential(exp_base=2, multiplier=3, max=90),
|
|
@@ -88,16 +130,20 @@ class PlaywrightCrawler:
|
|
|
88
130
|
reraise=True,
|
|
89
131
|
)
|
|
90
132
|
def init_context(self) -> Self:
|
|
91
|
-
"""
|
|
92
|
-
|
|
93
|
-
host, port, user, pwd = self.proxy_interface.get_proxies(raw=True, use_auth=True)
|
|
94
|
-
proxy_url = f"http://{host}:{port}"
|
|
95
|
-
proxy_cfg = {"server": proxy_url}
|
|
133
|
+
"""
|
|
134
|
+
Initializes a new browser instance and context.
|
|
96
135
|
|
|
97
|
-
|
|
98
|
-
|
|
136
|
+
Behavior:
|
|
137
|
+
- If a proxy_interface is provided, fetches fresh proxy credentials and starts
|
|
138
|
+
the browser using that proxy.
|
|
139
|
+
- If proxy_interface is None, starts the browser without any proxy.
|
|
140
|
+
|
|
141
|
+
Returns:
|
|
142
|
+
Self: The crawler instance with active browser, context, and page.
|
|
143
|
+
"""
|
|
144
|
+
try:
|
|
145
|
+
proxy_cfg: Optional[dict] = self._build_proxy_config()
|
|
99
146
|
|
|
100
|
-
logger.info(f"Starting browser with proxy: {proxy_url}")
|
|
101
147
|
self.pw = Camoufox(headless=True, geoip=True, humanize=True, proxy=proxy_cfg)
|
|
102
148
|
self.browser = self.pw.__enter__()
|
|
103
149
|
self.context = self.browser.new_context()
|
|
@@ -124,7 +170,7 @@ class PlaywrightCrawler:
|
|
|
124
170
|
before=lambda rs: rs.args[0].restart_context() if rs.attempt_number > 1 else None,
|
|
125
171
|
reraise=True,
|
|
126
172
|
)
|
|
127
|
-
def _goto_with_retry(self, url: str) ->
|
|
173
|
+
def _goto_with_retry(self, url: str, timeout: int = 30_000) -> Response:
|
|
128
174
|
"""
|
|
129
175
|
Navigates to a URL with retries for common Playwright errors.
|
|
130
176
|
Restarts the browser context on repeated failures.
|
|
@@ -133,12 +179,10 @@ class PlaywrightCrawler:
|
|
|
133
179
|
logger.warning("Page is not available or closed. Restarting context.")
|
|
134
180
|
self.restart_context()
|
|
135
181
|
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
self.page.goto(url, timeout=30000, wait_until="domcontentloaded")
|
|
139
|
-
return self.page
|
|
182
|
+
response = self.page.goto(url, timeout=timeout, wait_until="domcontentloaded")
|
|
183
|
+
return response
|
|
140
184
|
|
|
141
|
-
def goto(self, url: str) ->
|
|
185
|
+
def goto(self, url: str, max_proxy_delay: timedelta = timedelta(minutes=10), timeout: int = 30_000) -> Response:
|
|
142
186
|
"""
|
|
143
187
|
Ensures the browser is initialized and navigates to the given URL.
|
|
144
188
|
Public wrapper for the internal retry-enabled navigation method.
|
|
@@ -146,4 +190,92 @@ class PlaywrightCrawler:
|
|
|
146
190
|
if not self.page:
|
|
147
191
|
logger.info("Browser context not found, initializing now...")
|
|
148
192
|
self.init_context()
|
|
149
|
-
return self._goto_with_retry(url)
|
|
193
|
+
return self._goto_with_retry.retry_with(stop=stop_after_delay(max_proxy_delay))(self, url, timeout)
|
|
194
|
+
|
|
195
|
+
def _handle_http_error(self, status_code: int, url: str, response, allow_redirects: bool = True) -> None:
|
|
196
|
+
"""
|
|
197
|
+
Handle HTTP errors with special handling for redirects.
|
|
198
|
+
|
|
199
|
+
Args:
|
|
200
|
+
status_code: HTTP status code
|
|
201
|
+
url: Request URL
|
|
202
|
+
response: Response object
|
|
203
|
+
|
|
204
|
+
Raises:
|
|
205
|
+
RedirectionDetectedError: If a redirect status is received
|
|
206
|
+
NotFoundError: For 404/410 errors
|
|
207
|
+
BadRequestError: For 400 errors
|
|
208
|
+
HTTPError: For other non-2xx status codes
|
|
209
|
+
"""
|
|
210
|
+
|
|
211
|
+
if not allow_redirects and response.request.redirected_from: # noqa: F841
|
|
212
|
+
raise RedirectionDetectedError(
|
|
213
|
+
message=f"HTTP redirect detected from {response.request.redirected_from.url} to {response.request.redirected_from.redirected_to.url}",
|
|
214
|
+
response=response,
|
|
215
|
+
)
|
|
216
|
+
|
|
217
|
+
# Standard error handlers
|
|
218
|
+
error_handlers = {
|
|
219
|
+
404: lambda: NotFoundError(message=f"404 Not Found error for {url}", response=response),
|
|
220
|
+
410: lambda: NotFoundError(message=f"410 Gone error for {url}", response=response),
|
|
221
|
+
400: lambda: BadRequestError(message=f"400 Bad Request error for {url}", response=response),
|
|
222
|
+
}
|
|
223
|
+
|
|
224
|
+
if status_code in error_handlers:
|
|
225
|
+
raise error_handlers[status_code]()
|
|
226
|
+
|
|
227
|
+
# Raise for any other non-2xx status
|
|
228
|
+
if status_code >= 400:
|
|
229
|
+
raise HTTPError(f"Navigation failed: {status_code} - {url}", response=response)
|
|
230
|
+
|
|
231
|
+
@retry(
|
|
232
|
+
retry=retry_if_not_exception_type(
|
|
233
|
+
(IgnoredHTTPError, NotFoundError, BadRequestError, RedirectionDetectedError, ProxyError)
|
|
234
|
+
),
|
|
235
|
+
wait=wait_exponential(exp_base=3, multiplier=3, max=60),
|
|
236
|
+
stop=stop_after_attempt(5),
|
|
237
|
+
before_sleep=before_sleep_log(logger, logging.WARNING),
|
|
238
|
+
reraise=False,
|
|
239
|
+
retry_error_callback=lambda rs: None,
|
|
240
|
+
)
|
|
241
|
+
def get_data(
|
|
242
|
+
self,
|
|
243
|
+
url: str,
|
|
244
|
+
output: str = "json",
|
|
245
|
+
sleep: tuple = (6, 3),
|
|
246
|
+
max_proxy_delay: timedelta = timedelta(minutes=10),
|
|
247
|
+
ignored_status_codes: Sequence[int] = (),
|
|
248
|
+
timeout: int = 30_000,
|
|
249
|
+
**kwargs,
|
|
250
|
+
):
|
|
251
|
+
"""Crawls a given URL using Playwright and attempts to parse its body content"""
|
|
252
|
+
|
|
253
|
+
params = kwargs.copy()
|
|
254
|
+
|
|
255
|
+
allow_redirects = params.get("allow_redirects", True)
|
|
256
|
+
|
|
257
|
+
logger.info(f"Fetching data from {url} ...")
|
|
258
|
+
r = self.goto(url, max_proxy_delay, timeout)
|
|
259
|
+
ban_sleep(*sleep)
|
|
260
|
+
body_content = self.page.eval_on_selector("body", "body => body.innerText")
|
|
261
|
+
|
|
262
|
+
if r.status in ignored_status_codes:
|
|
263
|
+
raise IgnoredHTTPError(message=f"Status {r.status} in ignored_status_codes for URL {url}", response=r)
|
|
264
|
+
|
|
265
|
+
# Handle HTTP errors with redirect detection
|
|
266
|
+
self._handle_http_error(r.status, url, r, allow_redirects)
|
|
267
|
+
|
|
268
|
+
if not body_content:
|
|
269
|
+
raise EmptyResponseError(message=f"Empty body received from {url} (status {r.status})", response=r)
|
|
270
|
+
|
|
271
|
+
output_format = {
|
|
272
|
+
"json": lambda: json.loads(body_content),
|
|
273
|
+
"text": lambda: body_content,
|
|
274
|
+
"soup": lambda: BeautifulSoup(body_content, "html.parser"),
|
|
275
|
+
"response": lambda: r,
|
|
276
|
+
}
|
|
277
|
+
|
|
278
|
+
if output in output_format:
|
|
279
|
+
return output_format[output]()
|
|
280
|
+
|
|
281
|
+
raise ValueError(f"Unsupported output format: {output}")
|