datamarket 0.7.99__py3-none-any.whl → 0.7.100__py3-none-any.whl
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datamarket might be problematic. Click here for more details.
- datamarket/utils/playwright/sync_api.py +37 -12
- {datamarket-0.7.99.dist-info → datamarket-0.7.100.dist-info}/METADATA +1 -1
- {datamarket-0.7.99.dist-info → datamarket-0.7.100.dist-info}/RECORD +5 -5
- {datamarket-0.7.99.dist-info → datamarket-0.7.100.dist-info}/LICENSE +0 -0
- {datamarket-0.7.99.dist-info → datamarket-0.7.100.dist-info}/WHEEL +0 -0
|
@@ -53,12 +53,14 @@ def human_press_key(page: Page, key: str, count: int = 1, delay: int = 100, add_
|
|
|
53
53
|
class PlaywrightCrawler:
|
|
54
54
|
"""A robust, proxy-enabled Playwright crawler with captcha bypass and retry logic."""
|
|
55
55
|
|
|
56
|
-
def __init__(self, proxy_interface: ProxyInterface):
|
|
56
|
+
def __init__(self, proxy_interface: Optional[ProxyInterface] = None):
|
|
57
57
|
"""
|
|
58
|
-
Initializes the crawler
|
|
58
|
+
Initializes the crawler.
|
|
59
59
|
|
|
60
60
|
Args:
|
|
61
|
-
proxy_interface (ProxyInterface):
|
|
61
|
+
proxy_interface (Optional[ProxyInterface], optional): Provider used to fetch
|
|
62
|
+
proxy credentials. Defaults to None. When None, no proxy is configured and
|
|
63
|
+
the browser will run without a proxy.
|
|
62
64
|
"""
|
|
63
65
|
self.proxy_interface = proxy_interface
|
|
64
66
|
self.pw: Optional[Camoufox] = None
|
|
@@ -81,6 +83,25 @@ class PlaywrightCrawler:
|
|
|
81
83
|
if self.pw:
|
|
82
84
|
self.pw.__exit__(exc_type, exc_val, exc_tb)
|
|
83
85
|
|
|
86
|
+
def _build_proxy_config(self) -> Optional[dict]:
|
|
87
|
+
"""Builds the proxy configuration dictionary.
|
|
88
|
+
|
|
89
|
+
Returns:
|
|
90
|
+
Optional[dict]: Proxy configuration if a proxy_interface is provided; otherwise None.
|
|
91
|
+
"""
|
|
92
|
+
if not self.proxy_interface:
|
|
93
|
+
logger.info("Starting browser without proxy.")
|
|
94
|
+
return None
|
|
95
|
+
|
|
96
|
+
host, port, user, pwd = self.proxy_interface.get_proxies(raw=True, use_auth=True)
|
|
97
|
+
proxy_url = f"http://{host}:{port}"
|
|
98
|
+
proxy_cfg: dict = {"server": proxy_url}
|
|
99
|
+
if user and pwd:
|
|
100
|
+
proxy_cfg.update({"username": user, "password": pwd})
|
|
101
|
+
|
|
102
|
+
logger.info(f"Starting browser with proxy: {proxy_url}")
|
|
103
|
+
return proxy_cfg
|
|
104
|
+
|
|
84
105
|
@retry(
|
|
85
106
|
wait=wait_exponential(exp_base=2, multiplier=3, max=90),
|
|
86
107
|
stop=stop_after_delay(timedelta(minutes=10)),
|
|
@@ -88,16 +109,20 @@ class PlaywrightCrawler:
|
|
|
88
109
|
reraise=True,
|
|
89
110
|
)
|
|
90
111
|
def init_context(self) -> Self:
|
|
91
|
-
"""
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
112
|
+
"""
|
|
113
|
+
Initializes a new browser instance and context.
|
|
114
|
+
|
|
115
|
+
Behavior:
|
|
116
|
+
- If a proxy_interface is provided, fetches fresh proxy credentials and starts
|
|
117
|
+
the browser using that proxy.
|
|
118
|
+
- If proxy_interface is None, starts the browser without any proxy.
|
|
96
119
|
|
|
97
|
-
|
|
98
|
-
|
|
120
|
+
Returns:
|
|
121
|
+
Self: The crawler instance with active browser, context, and page.
|
|
122
|
+
"""
|
|
123
|
+
try:
|
|
124
|
+
proxy_cfg: Optional[dict] = self._build_proxy_config()
|
|
99
125
|
|
|
100
|
-
logger.info(f"Starting browser with proxy: {proxy_url}")
|
|
101
126
|
self.pw = Camoufox(headless=True, geoip=True, humanize=True, proxy=proxy_cfg)
|
|
102
127
|
self.browser = self.pw.__enter__()
|
|
103
128
|
self.context = self.browser.new_context()
|
|
@@ -146,4 +171,4 @@ class PlaywrightCrawler:
|
|
|
146
171
|
if not self.page:
|
|
147
172
|
logger.info("Browser context not found, initializing now...")
|
|
148
173
|
self.init_context()
|
|
149
|
-
return self._goto_with_retry(url)
|
|
174
|
+
return self._goto_with_retry(url)
|
|
@@ -20,7 +20,7 @@ datamarket/utils/main.py,sha256=KYHjDOps6_Q3TFV_Jj7MLj-L9Evx05AXELCvp06BARU,5857
|
|
|
20
20
|
datamarket/utils/nominatim.py,sha256=IxexKY2KOlDhiKtzsqQfoVUjJXPxJl7tn3iHUaQKg08,5795
|
|
21
21
|
datamarket/utils/playwright/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
22
22
|
datamarket/utils/playwright/async_api.py,sha256=UbA2D4ScBtYeMfrRjly4RO-s8wXIub9c05J1eoOCpsQ,5782
|
|
23
|
-
datamarket/utils/playwright/sync_api.py,sha256=
|
|
23
|
+
datamarket/utils/playwright/sync_api.py,sha256=eXaZsd7xgWSYJtZv6EAstjSbS2bl9OYlkwMBfqqTbFY,6434
|
|
24
24
|
datamarket/utils/selenium.py,sha256=IMKlbLzXABFhACnWzhHmB0l2hhVzNwHGZwbo14nEewQ,2499
|
|
25
25
|
datamarket/utils/soda.py,sha256=eZTXFbI1P3WoMd1MM-YjoVTpdjTcDSWuvBb7ViBMhSQ,941
|
|
26
26
|
datamarket/utils/strings/__init__.py,sha256=b6TYOT9v7y9ID-lDyZk4E8BH2uIPbsF2ZSLGjCQ1MCQ,43
|
|
@@ -29,7 +29,7 @@ datamarket/utils/strings/obfuscation.py,sha256=Jo-x3f2Cb75983smmpcdPqUlBrLCTyrnm
|
|
|
29
29
|
datamarket/utils/strings/standardization.py,sha256=c8CAG6HI3AfK0hB3A3IGwsbnQebZ6R3PrA5PELHRXM0,1492
|
|
30
30
|
datamarket/utils/typer.py,sha256=FDF3l6gh3UlAFPsHCtesnekvct2rKz0oFn3uKARBQvE,814
|
|
31
31
|
datamarket/utils/types.py,sha256=vxdQZdwdXrfPR4Es52gBgol-tMRIOD6oK9cBo3rB0JQ,74
|
|
32
|
-
datamarket-0.7.99.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
|
33
|
-
datamarket-0.7.
|
|
34
|
-
datamarket-0.7.99.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
|
|
35
|
-
datamarket-0.7.99.dist-info/RECORD,,
|
|
32
|
+
datamarket-0.7.100.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
|
33
|
+
datamarket-0.7.100.dist-info/METADATA,sha256=ZzGfCV51bIyPYJVdCSfJDdX8YuC9_BjKR1VCoRtd6yI,7382
|
|
34
|
+
datamarket-0.7.100.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
|
|
35
|
+
datamarket-0.7.100.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|