datamarket 0.9.49__py3-none-any.whl → 0.9.51__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datamarket might be problematic. Click here for more details.
- datamarket/utils/playwright/async_api.py +136 -7
- datamarket/utils/playwright/sync_api.py +134 -8
- datamarket/utils/strings/normalization.py +11 -4
- {datamarket-0.9.49.dist-info → datamarket-0.9.51.dist-info}/METADATA +1 -1
- {datamarket-0.9.49.dist-info → datamarket-0.9.51.dist-info}/RECORD +7 -7
- {datamarket-0.9.49.dist-info → datamarket-0.9.51.dist-info}/LICENSE +0 -0
- {datamarket-0.9.49.dist-info → datamarket-0.9.51.dist-info}/WHEEL +0 -0
|
@@ -2,22 +2,151 @@
|
|
|
2
2
|
# IMPORTS
|
|
3
3
|
|
|
4
4
|
import asyncio
|
|
5
|
+
import logging
|
|
6
|
+
from datetime import timedelta
|
|
5
7
|
from random import randint
|
|
8
|
+
from types import TracebackType
|
|
9
|
+
from typing import Optional, Self
|
|
6
10
|
|
|
7
|
-
|
|
11
|
+
# 'BdbQuit' import is removed as it's no longer used
|
|
12
|
+
from camoufox.async_api import AsyncCamoufox as Camoufox
|
|
13
|
+
from playwright.async_api import (
|
|
14
|
+
Browser,
|
|
15
|
+
BrowserContext,
|
|
16
|
+
Error as PlaywrightError,
|
|
17
|
+
Page,
|
|
18
|
+
TimeoutError as PlaywrightTimeoutError,
|
|
19
|
+
)
|
|
20
|
+
from tenacity import (
|
|
21
|
+
before_sleep_log,
|
|
22
|
+
retry,
|
|
23
|
+
retry_if_exception_type,
|
|
24
|
+
stop_after_delay,
|
|
25
|
+
wait_exponential,
|
|
26
|
+
)
|
|
27
|
+
|
|
28
|
+
from datamarket.interfaces.proxy import ProxyInterface
|
|
8
29
|
|
|
9
30
|
|
|
10
31
|
########################################################################################################################
|
|
11
|
-
#
|
|
32
|
+
# SETUP LOGGER
|
|
33
|
+
|
|
34
|
+
logger = logging.getLogger(__name__)
|
|
35
|
+
|
|
36
|
+
########################################################################################################################
|
|
37
|
+
# ASYNC HELPER FUNCTIONS
|
|
12
38
|
|
|
13
39
|
|
|
14
40
|
async def human_type(page: Page, text: str, delay: int = 100) -> None:
    """Asynchronously types `text` one character at a time with a randomized per-key delay.

    Args:
        page (Page): Page whose keyboard receives the characters.
        text (str): Text to type.
        delay (int): Nominal delay passed to `keyboard.type`; each character uses a random
            value between 50% and 150% of it.
    """
    # The jitter bounds do not depend on the character, so compute them once.
    low, high = int(delay * 0.5), int(delay * 1.5)
    for char in text:
        await page.keyboard.type(char, delay=randint(low, high))
|
|
17
43
|
|
|
18
44
|
|
|
19
|
-
async def human_press_key(page: Page, key: str, count: int = 1, delay: int = 100,
|
|
45
|
+
async def human_press_key(page: Page, key: str, count: int = 1, delay: int = 100, add_sleep: bool = True) -> None:
    """Asynchronously presses `key` `count` times with a randomized delay per press.

    Args:
        page (Page): Page whose keyboard receives the key presses.
        key (str): Key name passed to `keyboard.press`.
        count (int): Number of presses.
        delay (int): Nominal delay in milliseconds; each press uses a random value between
            50% and 150% of it.
        add_sleep (bool): When true, pause for a random 150%-250% of `delay` after every
            press (including the last one).
    """
    # All jitter bounds are loop-invariant, so compute them once.
    press_low, press_high = int(delay * 0.5), int(delay * 1.5)
    pause_low, pause_high = int(delay * 1.5), int(delay * 2.5)
    for _ in range(count):
        await page.keyboard.press(key, delay=randint(press_low, press_high))
        if add_sleep:
            # `delay` is in milliseconds while asyncio.sleep expects seconds.
            await asyncio.sleep(randint(pause_low, pause_high) / 1000)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
########################################################################################################################
|
|
54
|
+
# ASYNC CRAWLER CLASS
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
class PlaywrightCrawler:
    """A robust, proxy-enabled asynchronous Playwright crawler with retry logic.

    Every browser instance is a Camoufox session started behind a proxy obtained from
    `proxy_interface`. Use the crawler as an async context manager, or call `goto`
    directly and let it start the browser on first use.
    """

    def __init__(self, proxy_interface: ProxyInterface):
        """
        Initializes the async crawler with a proxy interface.

        Args:
            proxy_interface (ProxyInterface): Object whose `get_proxies` call returns the proxy credentials.
        """
        self.proxy_interface = proxy_interface
        self.pw: Optional[Camoufox] = None
        self.browser: Optional[Browser] = None
        self.context: Optional[BrowserContext] = None
        self.page: Optional[Page] = None
        # Set when a navigation attempt fails so the next retry starts from a fresh browser and proxy.
        self._restart_pending: bool = False

    async def __aenter__(self) -> Self:
        """Initializes the browser context when entering the `async with` statement."""
        await self.init_context()
        return self

    async def __aexit__(
        self,
        exc_type: Optional[type[BaseException]],
        exc_val: Optional[BaseException],
        exc_tb: Optional[TracebackType],
    ) -> None:
        """Closes the browser session upon exit and clears all handles."""
        await self._close(exc_type, exc_val, exc_tb)

    async def _close(
        self,
        exc_type: Optional[type[BaseException]] = None,
        exc_val: Optional[BaseException] = None,
        exc_tb: Optional[TracebackType] = None,
    ) -> None:
        """Exits the current Camoufox session, if any, and forgets every handle derived from it."""
        pw = self.pw
        # Clear the handles first: even if the exit call below raises, no later call can
        # exit the same session a second time or mistake a dead page for a live one.
        self.pw = None
        self.browser = None
        self.context = None
        self.page = None
        if pw is not None:
            await pw.__aexit__(exc_type, exc_val, exc_tb)

    @retry(
        wait=wait_exponential(exp_base=2, multiplier=3, max=90),
        stop=stop_after_delay(timedelta(minutes=10)),
        before_sleep=before_sleep_log(logger, logging.INFO),
        reraise=True,
    )
    async def init_context(self) -> Self:
        """
        Initializes a new async browser instance and context with a fresh proxy.

        Any exception triggers a retry with exponential backoff for up to ten minutes;
        the last exception is re-raised once that budget is exhausted.

        Returns:
            Self: This crawler, with `pw`, `browser`, `context` and `page` populated.
        """
        try:
            # get_proxies is a blocking call, so run it in a worker thread instead of on the event loop.
            host, port, user, pwd = await asyncio.to_thread(self.proxy_interface.get_proxies, raw=True, use_auth=True)
            proxy_url = f"http://{host}:{port}"
            proxy_cfg = {"server": proxy_url}

            if user and pwd:
                proxy_cfg.update({"username": user, "password": pwd})

            logger.info(f"Starting browser with proxy: {proxy_url}")
            self.pw = Camoufox(headless=True, geoip=True, humanize=True, proxy=proxy_cfg)
            self.browser = await self.pw.__aenter__()
            self.context = await self.browser.new_context()
            self.page = await self.context.new_page()
        except Exception as e:
            logger.error(f"Failed to initialize browser context: {e}")
            # Tear down whatever was started and reset the handles so the next retry begins clean.
            await self._close(type(e), e, e.__traceback__)
            raise
        return self

    async def restart_context(self) -> None:
        """Closes the current browser instance and initializes a new one."""
        logger.info("Restarting browser context...")
        await self._close()
        await self.init_context()

    @retry(
        retry=retry_if_exception_type((PlaywrightTimeoutError, PlaywrightError)),
        wait=wait_exponential(exp_base=2, multiplier=3, max=90),
        stop=stop_after_delay(timedelta(minutes=10)),
        before_sleep=before_sleep_log(logger, logging.INFO),
        reraise=True,
    )
    async def _goto_with_retry(self, url: str) -> Page:
        """
        Asynchronously navigates to a URL with retries for common Playwright errors.

        After a failed attempt the browser context is restarted (new browser, new proxy)
        before the next attempt. It is also restarted when the page is missing or closed.

        Args:
            url (str): Address to navigate to.

        Returns:
            Page: The page after navigation reached `domcontentloaded`.
        """
        if self._restart_pending:
            logger.info("Previous navigation attempt failed. Restarting context.")
            await self.restart_context()
            self._restart_pending = False
        elif self.page is None or self.page.is_closed():
            logger.warning("Page is not available or closed. Restarting context.")
            await self.restart_context()

        # restart_context/init_context either set self.page or raise, so this only narrows the type.
        assert self.page is not None
        try:
            await self.page.goto(url, timeout=30000, wait_until="domcontentloaded")
        except (PlaywrightTimeoutError, PlaywrightError):
            # Remember the failure; the retry decorator calls this method again, which then restarts.
            self._restart_pending = True
            raise
        return self.page

    async def goto(self, url: str) -> Page:
        """
        Ensures the browser is initialized and navigates to the given URL.
        Public wrapper for the internal retry-enabled navigation method.

        Args:
            url (str): Address to navigate to.

        Returns:
            Page: The page after a successful navigation.
        """
        if not self.page:
            logger.info("Browser context not found, initializing now...")
            await self.init_context()
        # Only failures within this call should force a restart, not leftovers from an earlier call.
        self._restart_pending = False
        return await self._goto_with_retry(url)
|
|
@@ -1,23 +1,149 @@
|
|
|
1
1
|
########################################################################################################################
|
|
2
2
|
# IMPORTS
|
|
3
3
|
|
|
4
|
+
import logging
|
|
4
5
|
import time
|
|
6
|
+
from datetime import timedelta
|
|
5
7
|
from random import randint
|
|
8
|
+
from types import TracebackType
|
|
9
|
+
from typing import Optional, Self
|
|
6
10
|
|
|
7
|
-
from
|
|
8
|
-
|
|
11
|
+
from camoufox import Camoufox
|
|
12
|
+
from playwright.sync_api import (
|
|
13
|
+
Browser,
|
|
14
|
+
BrowserContext,
|
|
15
|
+
Error as PlaywrightError,
|
|
16
|
+
Page,
|
|
17
|
+
TimeoutError as PlaywrightTimeoutError,
|
|
18
|
+
)
|
|
19
|
+
from tenacity import (
|
|
20
|
+
before_sleep_log,
|
|
21
|
+
retry,
|
|
22
|
+
retry_if_exception_type,
|
|
23
|
+
stop_after_delay,
|
|
24
|
+
wait_exponential,
|
|
25
|
+
)
|
|
26
|
+
from datamarket.interfaces.proxy import ProxyInterface
|
|
9
27
|
|
|
10
28
|
########################################################################################################################
|
|
11
|
-
#
|
|
29
|
+
# SETUP LOGGER
|
|
30
|
+
|
|
31
|
+
logger = logging.getLogger(__name__)
|
|
12
32
|
|
|
13
33
|
|
|
34
|
+
########################################################################################################################
|
|
35
|
+
# HELPER FUNCTIONS
|
|
14
36
|
def human_type(page: Page, text: str, delay: int = 100) -> None:
    """Types `text` one character at a time with a randomized per-key delay.

    Args:
        page (Page): Page whose keyboard receives the characters.
        text (str): Text to type.
        delay (int): Nominal delay passed to `keyboard.type`; each character uses a random
            value between 50% and 150% of it.
    """
    # The jitter bounds do not depend on the character, so compute them once.
    low, high = int(delay * 0.5), int(delay * 1.5)
    for char in text:
        page.keyboard.type(char, delay=randint(low, high))
|
|
17
39
|
|
|
18
40
|
|
|
19
|
-
def human_press_key(page: Page, key: str, count: int = 1, delay: int = 100,
|
|
41
|
+
def human_press_key(page: Page, key: str, count: int = 1, delay: int = 100, add_sleep: bool = True) -> None:
    """Presses `key` `count` times with a randomized delay per press.

    Args:
        page (Page): Page whose keyboard receives the key presses.
        key (str): Key name passed to `keyboard.press`.
        count (int): Number of presses.
        delay (int): Nominal delay in milliseconds; each press uses a random value between
            50% and 150% of it.
        add_sleep (bool): When true, pause for a random 150%-250% of `delay` after every
            press (including the last one).
    """
    # All jitter bounds are loop-invariant, so compute them once.
    press_low, press_high = int(delay * 0.5), int(delay * 1.5)
    pause_low, pause_high = int(delay * 1.5), int(delay * 2.5)
    for _ in range(count):
        page.keyboard.press(key, delay=randint(press_low, press_high))
        if add_sleep:
            # `delay` is in milliseconds while time.sleep expects seconds.
            time.sleep(randint(pause_low, pause_high) / 1000)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
########################################################################################################################
|
|
50
|
+
# CRAWLER CLASS
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class PlaywrightCrawler:
    """A robust, proxy-enabled Playwright crawler with retry logic.

    Every browser instance is a Camoufox session started behind a proxy obtained from
    `proxy_interface`. Use the crawler as a context manager, or call `goto` directly
    and let it start the browser on first use.
    """

    def __init__(self, proxy_interface: ProxyInterface):
        """
        Initializes the crawler with a proxy interface.

        Args:
            proxy_interface (ProxyInterface): Object whose `get_proxies` call returns the proxy credentials.
        """
        self.proxy_interface = proxy_interface
        self.pw: Optional[Camoufox] = None
        self.browser: Optional[Browser] = None
        self.context: Optional[BrowserContext] = None
        self.page: Optional[Page] = None

    def __enter__(self) -> Self:
        """Initializes the browser context when entering the `with` statement."""
        self.init_context()
        return self

    def __exit__(
        self,
        exc_type: Optional[type[BaseException]],
        exc_val: Optional[BaseException],
        exc_tb: Optional[TracebackType],
    ) -> None:
        """Closes the browser session upon exit and clears all handles."""
        self._close(exc_type, exc_val, exc_tb)

    def _close(
        self,
        exc_type: Optional[type[BaseException]] = None,
        exc_val: Optional[BaseException] = None,
        exc_tb: Optional[TracebackType] = None,
    ) -> None:
        """Exits the current Camoufox session, if any, and forgets every handle derived from it."""
        pw = self.pw
        # Clear the handles first: even if the exit call below raises, no later call can
        # exit the same session a second time or mistake a dead page for a live one.
        self.pw = None
        self.browser = None
        self.context = None
        self.page = None
        if pw is not None:
            pw.__exit__(exc_type, exc_val, exc_tb)

    @retry(
        wait=wait_exponential(exp_base=2, multiplier=3, max=90),
        stop=stop_after_delay(timedelta(minutes=10)),
        before_sleep=before_sleep_log(logger, logging.INFO),
        reraise=True,
    )
    def init_context(self) -> Self:
        """
        Initializes a new browser instance and context with a fresh proxy.

        Any exception triggers a retry with exponential backoff for up to ten minutes;
        the last exception is re-raised once that budget is exhausted.

        Returns:
            Self: This crawler, with `pw`, `browser`, `context` and `page` populated.
        """
        try:
            host, port, user, pwd = self.proxy_interface.get_proxies(raw=True, use_auth=True)
            proxy_url = f"http://{host}:{port}"
            proxy_cfg = {"server": proxy_url}

            if user and pwd:
                proxy_cfg.update({"username": user, "password": pwd})

            logger.info(f"Starting browser with proxy: {proxy_url}")
            self.pw = Camoufox(headless=True, geoip=True, humanize=True, proxy=proxy_cfg)
            self.browser = self.pw.__enter__()
            self.context = self.browser.new_context()
            self.page = self.context.new_page()
        except Exception as e:
            logger.error(f"Failed to initialize browser context: {e}")
            # Tear down whatever was started and reset the handles so the next retry begins clean.
            self._close(type(e), e, e.__traceback__)
            raise
        return self

    def restart_context(self) -> None:
        """Closes the current browser instance and initializes a new one."""
        logger.info("Restarting browser context...")
        self._close()
        self.init_context()

    @retry(
        retry=retry_if_exception_type((PlaywrightTimeoutError, PlaywrightError)),
        wait=wait_exponential(exp_base=2, multiplier=3, max=90),
        stop=stop_after_delay(timedelta(minutes=10)),
        before_sleep=before_sleep_log(logger, logging.INFO),
        # From the second attempt on, start over with a new browser and proxy; args[0] is `self`.
        before=lambda rs: rs.args[0].restart_context() if rs.attempt_number > 1 else None,
        reraise=True,
    )
    def _goto_with_retry(self, url: str) -> Page:
        """
        Navigates to a URL with retries for common Playwright errors.
        Restarts the browser context on repeated failures.

        Args:
            url (str): Address to navigate to.

        Returns:
            Page: The page after navigation reached `domcontentloaded`.
        """
        if self.page is None or self.page.is_closed():
            logger.warning("Page is not available or closed. Restarting context.")
            self.restart_context()

        # restart_context/init_context either set self.page or raise, so this only narrows the type.
        assert self.page is not None
        self.page.goto(url, timeout=30000, wait_until="domcontentloaded")
        return self.page

    def goto(self, url: str) -> Page:
        """
        Ensures the browser is initialized and navigates to the given URL.
        Public wrapper for the internal retry-enabled navigation method.

        Args:
            url (str): Address to navigate to.

        Returns:
            Page: The page after a successful navigation.
        """
        if not self.page:
            logger.info("Browser context not found, initializing now...")
            self.init_context()
        return self._goto_with_retry(url)
|
|
@@ -137,8 +137,13 @@ def normalize(
|
|
|
137
137
|
# Parameter mapping
|
|
138
138
|
if isinstance(mode, str):
|
|
139
139
|
mode = NormalizationMode[mode.upper()]
|
|
140
|
+
if not isinstance(mode, NormalizationMode):
|
|
141
|
+
raise TypeError("mode must be NormalizationMode or str")
|
|
142
|
+
|
|
140
143
|
if isinstance(naming, str):
|
|
141
144
|
naming = NamingConvention[naming.upper()]
|
|
145
|
+
if not isinstance(naming, NamingConvention):
|
|
146
|
+
raise TypeError("naming must be NamingConvention or str")
|
|
142
147
|
|
|
143
148
|
_allowed_symbols_set: Set[str] = set(allowed_symbols) if allowed_symbols else set()
|
|
144
149
|
|
|
@@ -148,7 +153,11 @@ def normalize(
|
|
|
148
153
|
elif not isinstance(s, str):
|
|
149
154
|
return str(s)
|
|
150
155
|
else:
|
|
151
|
-
|
|
156
|
+
raw_text = str(s)
|
|
157
|
+
if naming is NamingConvention.NONE:
|
|
158
|
+
text = raw_text
|
|
159
|
+
else:
|
|
160
|
+
text = prettify(strip_html(raw_text, True))
|
|
152
161
|
|
|
153
162
|
if mode is NormalizationMode.NONE:
|
|
154
163
|
normalized = text
|
|
@@ -170,9 +179,7 @@ def normalize(
|
|
|
170
179
|
|
|
171
180
|
for c in intermediate_text:
|
|
172
181
|
cat = unicodedata.category(c)
|
|
173
|
-
if c in _allowed_symbols_set: # Allowed symbols are part of tokens
|
|
174
|
-
current_token_chars.append(c)
|
|
175
|
-
elif c.isalnum():
|
|
182
|
+
if c in _allowed_symbols_set or c.isalnum(): # Allowed symbols are part of tokens
|
|
176
183
|
current_token_chars.append(c)
|
|
177
184
|
elif mode is NormalizationMode.FULL and cat.startswith("S"):
|
|
178
185
|
# Transliterate S* category symbols not in allowed_symbols
|
|
@@ -17,16 +17,16 @@ datamarket/utils/airflow.py,sha256=al0vc0YUikNu3Oy51VSn52I7pMU40akFBOl_UlHa2E4,7
|
|
|
17
17
|
datamarket/utils/alchemy.py,sha256=SRq6kgh1aANXVShBPgAuglmNhZssPWwWEY503gKSia8,635
|
|
18
18
|
datamarket/utils/main.py,sha256=DMMgkQnMS6fNziTru8FM9z2ERfYfkdR9qFPF7s6sp7U,9657
|
|
19
19
|
datamarket/utils/playwright/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
20
|
-
datamarket/utils/playwright/async_api.py,sha256=
|
|
21
|
-
datamarket/utils/playwright/sync_api.py,sha256=
|
|
20
|
+
datamarket/utils/playwright/async_api.py,sha256=UbA2D4ScBtYeMfrRjly4RO-s8wXIub9c05J1eoOCpsQ,5782
|
|
21
|
+
datamarket/utils/playwright/sync_api.py,sha256=Tw_-KLB3vipFuEQwcX8iCbj7giCzcwXB-bhl_ncR-2Q,5542
|
|
22
22
|
datamarket/utils/selenium.py,sha256=IMKlbLzXABFhACnWzhHmB0l2hhVzNwHGZwbo14nEewQ,2499
|
|
23
23
|
datamarket/utils/soda.py,sha256=eZTXFbI1P3WoMd1MM-YjoVTpdjTcDSWuvBb7ViBMhSQ,941
|
|
24
24
|
datamarket/utils/strings/__init__.py,sha256=b6TYOT9v7y9ID-lDyZk4E8BH2uIPbsF2ZSLGjCQ1MCQ,43
|
|
25
|
-
datamarket/utils/strings/normalization.py,sha256=
|
|
25
|
+
datamarket/utils/strings/normalization.py,sha256=rj0wfJSjqcCRp-ruHqc5pylO3_TOmY5_V1lKzkyWoAA,8991
|
|
26
26
|
datamarket/utils/strings/obfuscation.py,sha256=8gMepfjPq0N4_IpKR6i2dy_9VJugQ3qJiRiRvKavB3s,5246
|
|
27
27
|
datamarket/utils/typer.py,sha256=FDF3l6gh3UlAFPsHCtesnekvct2rKz0oFn3uKARBQvE,814
|
|
28
28
|
datamarket/utils/types.py,sha256=vxdQZdwdXrfPR4Es52gBgol-tMRIOD6oK9cBo3rB0JQ,74
|
|
29
|
-
datamarket-0.9.
|
|
30
|
-
datamarket-0.9.
|
|
31
|
-
datamarket-0.9.
|
|
32
|
-
datamarket-0.9.
|
|
29
|
+
datamarket-0.9.51.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
|
30
|
+
datamarket-0.9.51.dist-info/METADATA,sha256=UPV2cxDlddvKHiKY5tSt-dDkA7reLhyIX1KAIfcjxag,7326
|
|
31
|
+
datamarket-0.9.51.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
|
|
32
|
+
datamarket-0.9.51.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|