datamarket 0.7.99__tar.gz → 0.7.101__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datamarket might be problematic.
- {datamarket-0.7.99 → datamarket-0.7.101}/PKG-INFO +1 -1
- {datamarket-0.7.99 → datamarket-0.7.101}/pyproject.toml +1 -1
- {datamarket-0.7.99 → datamarket-0.7.101}/src/datamarket/exceptions/main.py +6 -0
- {datamarket-0.7.99 → datamarket-0.7.101}/src/datamarket/utils/main.py +6 -3
- {datamarket-0.7.99 → datamarket-0.7.101}/src/datamarket/utils/playwright/sync_api.py +37 -12
- {datamarket-0.7.99 → datamarket-0.7.101}/LICENSE +0 -0
- {datamarket-0.7.99 → datamarket-0.7.101}/README.md +0 -0
- {datamarket-0.7.99 → datamarket-0.7.101}/src/datamarket/__init__.py +0 -0
- {datamarket-0.7.99 → datamarket-0.7.101}/src/datamarket/exceptions/__init__.py +0 -0
- {datamarket-0.7.99 → datamarket-0.7.101}/src/datamarket/interfaces/__init__.py +0 -0
- {datamarket-0.7.99 → datamarket-0.7.101}/src/datamarket/interfaces/alchemy.py +0 -0
- {datamarket-0.7.99 → datamarket-0.7.101}/src/datamarket/interfaces/aws.py +0 -0
- {datamarket-0.7.99 → datamarket-0.7.101}/src/datamarket/interfaces/azure.py +0 -0
- {datamarket-0.7.99 → datamarket-0.7.101}/src/datamarket/interfaces/drive.py +0 -0
- {datamarket-0.7.99 → datamarket-0.7.101}/src/datamarket/interfaces/ftp.py +0 -0
- {datamarket-0.7.99 → datamarket-0.7.101}/src/datamarket/interfaces/nominatim.py +0 -0
- {datamarket-0.7.99 → datamarket-0.7.101}/src/datamarket/interfaces/peerdb.py +0 -0
- {datamarket-0.7.99 → datamarket-0.7.101}/src/datamarket/interfaces/proxy.py +0 -0
- {datamarket-0.7.99 → datamarket-0.7.101}/src/datamarket/interfaces/tinybird.py +0 -0
- {datamarket-0.7.99 → datamarket-0.7.101}/src/datamarket/params/__init__.py +0 -0
- {datamarket-0.7.99 → datamarket-0.7.101}/src/datamarket/params/nominatim.py +0 -0
- {datamarket-0.7.99 → datamarket-0.7.101}/src/datamarket/utils/__init__.py +0 -0
- {datamarket-0.7.99 → datamarket-0.7.101}/src/datamarket/utils/airflow.py +0 -0
- {datamarket-0.7.99 → datamarket-0.7.101}/src/datamarket/utils/alchemy.py +0 -0
- {datamarket-0.7.99 → datamarket-0.7.101}/src/datamarket/utils/nominatim.py +0 -0
- {datamarket-0.7.99 → datamarket-0.7.101}/src/datamarket/utils/playwright/__init__.py +0 -0
- {datamarket-0.7.99 → datamarket-0.7.101}/src/datamarket/utils/playwright/async_api.py +0 -0
- {datamarket-0.7.99 → datamarket-0.7.101}/src/datamarket/utils/selenium.py +0 -0
- {datamarket-0.7.99 → datamarket-0.7.101}/src/datamarket/utils/soda.py +0 -0
- {datamarket-0.7.99 → datamarket-0.7.101}/src/datamarket/utils/strings/__init__.py +0 -0
- {datamarket-0.7.99 → datamarket-0.7.101}/src/datamarket/utils/strings/normalization.py +0 -0
- {datamarket-0.7.99 → datamarket-0.7.101}/src/datamarket/utils/strings/obfuscation.py +0 -0
- {datamarket-0.7.99 → datamarket-0.7.101}/src/datamarket/utils/strings/standardization.py +0 -0
- {datamarket-0.7.99 → datamarket-0.7.101}/src/datamarket/utils/typer.py +0 -0
- {datamarket-0.7.99 → datamarket-0.7.101}/src/datamarket/utils/types.py +0 -0
src/datamarket/exceptions/main.py

@@ -12,3 +12,9 @@ class NotFoundError(Exception):
     def __init__(self, message="Not found!"):
         self.message = message
         super().__init__(self.message)
+
+
+class BadRequestError(Exception):
+    def __init__(self, message="Bad request!"):
+        self.message = message
+        super().__init__(self.message)
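The new BadRequestError mirrors the existing NotFoundError: a plain Exception subclass carrying a default message. As a minimal illustration of how downstream code can now separate 400-type failures from 404s, a hypothetical caller (the wrapper function below is illustrative, not part of the package; the import path follows the one utils/main.py uses):

from datamarket.exceptions import BadRequestError, NotFoundError


def fetch_or_skip(fetch, url):
    """Hypothetical caller: both exceptions stringify to their .message."""
    try:
        return fetch(url)
    except BadRequestError as exc:   # e.g. "400 Bad Request error for <url>"
        print(f"skipping {url}: {exc}")
    except NotFoundError as exc:     # e.g. "404 Not Found error for <url>"
        print(f"skipping {url}: {exc}")
    return None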
src/datamarket/utils/main.py

@@ -3,6 +3,7 @@
 
 import asyncio
 import configparser
+from datetime import timedelta
 import logging
 import random
 import re
@@ -25,7 +26,7 @@ from tenacity import (
     wait_exponential,
 )
 
-from ..exceptions import RedirectionDetectedError, NotFoundError
+from ..exceptions import RedirectionDetectedError, NotFoundError, BadRequestError
 from ..interfaces.proxy import ProxyInterface
 
 ########################################################################################################################
@@ -131,7 +132,7 @@ def parse_field(dict_struct, field_path, format_method=None):
 
 
 @retry(
-    retry=retry_if_not_exception_type((NotFoundError, RedirectionDetectedError, ProxyError)),
+    retry=retry_if_not_exception_type((NotFoundError, BadRequestError, RedirectionDetectedError, ProxyError)),
     wait=wait_exponential(exp_base=3, multiplier=3, max=60),
     stop=stop_after_attempt(5),
     before_sleep=before_sleep_log(logger, logging.WARNING),
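The decorator change matters because tenacity's retry_if_not_exception_type inverts the usual predicate: every exception except the listed types is retried, so a BadRequestError now escapes get_data on the first attempt instead of burning through five retries. A self-contained sketch of that behavior with a stand-in function (names below are illustrative, not the package's):

from tenacity import retry, retry_if_not_exception_type, stop_after_attempt, wait_exponential


class BadRequestError(Exception):
    """Stand-in for datamarket.exceptions.BadRequestError."""


@retry(
    retry=retry_if_not_exception_type(BadRequestError),  # everything else is retried
    wait=wait_exponential(exp_base=3, multiplier=3, max=60),
    stop=stop_after_attempt(5),
)
def flaky_request(status_code: int) -> str:
    if status_code == 400:
        raise BadRequestError("400 Bad Request")       # raised once, never retried
    if status_code >= 500:
        raise RuntimeError("transient server error")   # retried up to 5 attempts
    return "ok"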
@@ -144,7 +145,7 @@ def get_data(
     sleep: tuple = (6, 3),
     proxy_interface: ProxyInterface = None,
     use_auth_proxies: bool = False,
-    max_proxy_delay:
+    max_proxy_delay: timedelta = timedelta(minutes=10),
     **kwargs,
 ):
     retry_type = retry_if_exception_type(ProxyError)
@@ -175,6 +176,8 @@ def get_data(
 
     if r.status_code == 404:
         raise NotFoundError(f"404 Not Found error for {url}")
+    if r.status_code == 400:
+        raise BadRequestError(f"400 Bad Request error for {url}")
     r.raise_for_status()
     r.encoding = "utf-8"
 
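Note that the new 400 check sits before r.raise_for_status(), so a bad request becomes the non-retryable BadRequestError rather than a generic requests.HTTPError, which the retry decorator above would keep retrying. The same pattern in isolation (assuming a requests.Response; the helper and exception below are illustrative stand-ins):

import requests


class BadRequestError(Exception):
    """Stand-in for datamarket.exceptions.BadRequestError."""


def check_response(r: requests.Response, url: str) -> requests.Response:
    # Map selected client errors to dedicated, non-retryable exceptions first...
    if r.status_code == 400:
        raise BadRequestError(f"400 Bad Request error for {url}")
    # ...then let every other non-2xx status raise requests.HTTPError,
    # which the retry policy treats as retryable.
    r.raise_for_status()
    return r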
src/datamarket/utils/playwright/sync_api.py

@@ -53,12 +53,14 @@ def human_press_key(page: Page, key: str, count: int = 1, delay: int = 100, add_
 class PlaywrightCrawler:
     """A robust, proxy-enabled Playwright crawler with captcha bypass and retry logic."""
 
-    def __init__(self, proxy_interface: ProxyInterface):
+    def __init__(self, proxy_interface: Optional[ProxyInterface] = None):
         """
-        Initializes the crawler
+        Initializes the crawler.
 
         Args:
-            proxy_interface (ProxyInterface):
+            proxy_interface (Optional[ProxyInterface], optional): Provider used to fetch
+                proxy credentials. Defaults to None. When None, no proxy is configured and
+                the browser will run without a proxy.
         """
         self.proxy_interface = proxy_interface
         self.pw: Optional[Camoufox] = None
@@ -81,6 +83,25 @@ class PlaywrightCrawler:
         if self.pw:
             self.pw.__exit__(exc_type, exc_val, exc_tb)
 
+    def _build_proxy_config(self) -> Optional[dict]:
+        """Builds the proxy configuration dictionary.
+
+        Returns:
+            Optional[dict]: Proxy configuration if a proxy_interface is provided; otherwise None.
+        """
+        if not self.proxy_interface:
+            logger.info("Starting browser without proxy.")
+            return None
+
+        host, port, user, pwd = self.proxy_interface.get_proxies(raw=True, use_auth=True)
+        proxy_url = f"http://{host}:{port}"
+        proxy_cfg: dict = {"server": proxy_url}
+        if user and pwd:
+            proxy_cfg.update({"username": user, "password": pwd})
+
+        logger.info(f"Starting browser with proxy: {proxy_url}")
+        return proxy_cfg
+
     @retry(
         wait=wait_exponential(exp_base=2, multiplier=3, max=90),
         stop=stop_after_delay(timedelta(minutes=10)),
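The dictionary returned by _build_proxy_config follows Playwright's proxy settings shape ("server" plus optional "username"/"password"), which Camoufox accepts through its proxy argument as the diff shows. A standalone sketch of the same logic, using a hypothetical stub in place of ProxyInterface:

from typing import Optional, Tuple


class StubProxyInterface:
    """Hypothetical stand-in mimicking ProxyInterface.get_proxies(raw=True, use_auth=True)."""

    def get_proxies(self, raw: bool = True, use_auth: bool = True) -> Tuple[str, int, str, str]:
        return "proxy.example.com", 8080, "alice", "s3cret"


def build_proxy_config(proxy_interface: Optional[StubProxyInterface]) -> Optional[dict]:
    # Mirrors PlaywrightCrawler._build_proxy_config from the hunk above.
    if not proxy_interface:
        return None  # browser will start without a proxy
    host, port, user, pwd = proxy_interface.get_proxies(raw=True, use_auth=True)
    cfg: dict = {"server": f"http://{host}:{port}"}
    if user and pwd:
        cfg.update({"username": user, "password": pwd})
    return cfg


print(build_proxy_config(StubProxyInterface()))
# {'server': 'http://proxy.example.com:8080', 'username': 'alice', 'password': 's3cret'}
print(build_proxy_config(None))
# None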
@@ -88,16 +109,20 @@ class PlaywrightCrawler:
         reraise=True,
     )
     def init_context(self) -> Self:
-        """
-
-
-
-
+        """
+        Initializes a new browser instance and context.
+
+        Behavior:
+            - If a proxy_interface is provided, fetches fresh proxy credentials and starts
+              the browser using that proxy.
+            - If proxy_interface is None, starts the browser without any proxy.
 
-
-
+        Returns:
+            Self: The crawler instance with active browser, context, and page.
+        """
+        try:
+            proxy_cfg: Optional[dict] = self._build_proxy_config()
 
-            logger.info(f"Starting browser with proxy: {proxy_url}")
             self.pw = Camoufox(headless=True, geoip=True, humanize=True, proxy=proxy_cfg)
             self.browser = self.pw.__enter__()
             self.context = self.browser.new_context()
@@ -146,4 +171,4 @@ class PlaywrightCrawler:
         if not self.page:
             logger.info("Browser context not found, initializing now...")
             self.init_context()
-        return self._goto_with_retry(url)
+        return self._goto_with_retry(url)
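With proxy_interface now optional, the crawler can be constructed without any proxy at all: _build_proxy_config returns None and Camoufox is started with proxy=None. A hedged usage sketch (only init_context(), page, and the proxy_interface argument are taken from the diff; anything beyond that would be an assumption):

from datamarket.utils.playwright.sync_api import PlaywrightCrawler

# Proxy-less crawl: no ProxyInterface is needed any more.
crawler = PlaywrightCrawler()   # before 0.7.101 this argument was required
crawler.init_context()          # launches browser, context and page (retried on failure)
print(crawler.page)             # Playwright page created by init_context()

# Proxy-backed crawl is unchanged: pass a ProxyInterface and init_context()
# fetches fresh credentials each time the browser is (re)started.
# crawler = PlaywrightCrawler(proxy_interface=my_proxy_interface)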