datamarket 0.9.49__tar.gz → 0.9.50__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datamarket might be problematic. Click here for more details.

Files changed (34) hide show
  1. {datamarket-0.9.49 → datamarket-0.9.50}/PKG-INFO +1 -1
  2. {datamarket-0.9.49 → datamarket-0.9.50}/pyproject.toml +1 -1
  3. datamarket-0.9.50/src/datamarket/utils/playwright/async_api.py +152 -0
  4. datamarket-0.9.50/src/datamarket/utils/playwright/sync_api.py +149 -0
  5. datamarket-0.9.49/src/datamarket/utils/playwright/async_api.py +0 -23
  6. datamarket-0.9.49/src/datamarket/utils/playwright/sync_api.py +0 -23
  7. {datamarket-0.9.49 → datamarket-0.9.50}/LICENSE +0 -0
  8. {datamarket-0.9.49 → datamarket-0.9.50}/README.md +0 -0
  9. {datamarket-0.9.49 → datamarket-0.9.50}/src/datamarket/__init__.py +0 -0
  10. {datamarket-0.9.49 → datamarket-0.9.50}/src/datamarket/exceptions/__init__.py +0 -0
  11. {datamarket-0.9.49 → datamarket-0.9.50}/src/datamarket/exceptions/main.py +0 -0
  12. {datamarket-0.9.49 → datamarket-0.9.50}/src/datamarket/interfaces/__init__.py +0 -0
  13. {datamarket-0.9.49 → datamarket-0.9.50}/src/datamarket/interfaces/alchemy.py +0 -0
  14. {datamarket-0.9.49 → datamarket-0.9.50}/src/datamarket/interfaces/aws.py +0 -0
  15. {datamarket-0.9.49 → datamarket-0.9.50}/src/datamarket/interfaces/drive.py +0 -0
  16. {datamarket-0.9.49 → datamarket-0.9.50}/src/datamarket/interfaces/ftp.py +0 -0
  17. {datamarket-0.9.49 → datamarket-0.9.50}/src/datamarket/interfaces/nominatim.py +0 -0
  18. {datamarket-0.9.49 → datamarket-0.9.50}/src/datamarket/interfaces/peerdb.py +0 -0
  19. {datamarket-0.9.49 → datamarket-0.9.50}/src/datamarket/interfaces/proxy.py +0 -0
  20. {datamarket-0.9.49 → datamarket-0.9.50}/src/datamarket/interfaces/tinybird.py +0 -0
  21. {datamarket-0.9.49 → datamarket-0.9.50}/src/datamarket/params/__init__.py +0 -0
  22. {datamarket-0.9.49 → datamarket-0.9.50}/src/datamarket/params/nominatim.py +0 -0
  23. {datamarket-0.9.49 → datamarket-0.9.50}/src/datamarket/utils/__init__.py +0 -0
  24. {datamarket-0.9.49 → datamarket-0.9.50}/src/datamarket/utils/airflow.py +0 -0
  25. {datamarket-0.9.49 → datamarket-0.9.50}/src/datamarket/utils/alchemy.py +0 -0
  26. {datamarket-0.9.49 → datamarket-0.9.50}/src/datamarket/utils/main.py +0 -0
  27. {datamarket-0.9.49 → datamarket-0.9.50}/src/datamarket/utils/playwright/__init__.py +0 -0
  28. {datamarket-0.9.49 → datamarket-0.9.50}/src/datamarket/utils/selenium.py +0 -0
  29. {datamarket-0.9.49 → datamarket-0.9.50}/src/datamarket/utils/soda.py +0 -0
  30. {datamarket-0.9.49 → datamarket-0.9.50}/src/datamarket/utils/strings/__init__.py +0 -0
  31. {datamarket-0.9.49 → datamarket-0.9.50}/src/datamarket/utils/strings/normalization.py +0 -0
  32. {datamarket-0.9.49 → datamarket-0.9.50}/src/datamarket/utils/strings/obfuscation.py +0 -0
  33. {datamarket-0.9.49 → datamarket-0.9.50}/src/datamarket/utils/typer.py +0 -0
  34. {datamarket-0.9.49 → datamarket-0.9.50}/src/datamarket/utils/types.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: datamarket
3
- Version: 0.9.49
3
+ Version: 0.9.50
4
4
  Summary: Utilities that integrate advanced scraping knowledge into just one library.
5
5
  License: GPL-3.0-or-later
6
6
  Author: DataMarket
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "datamarket"
3
- version = "0.9.49"
3
+ version = "0.9.50"
4
4
  description = "Utilities that integrate advanced scraping knowledge into just one library."
5
5
  authors = ["DataMarket <techsupport@datamarket.es>"]
6
6
  license = "GPL-3.0-or-later"
@@ -0,0 +1,152 @@
1
+ ########################################################################################################################
2
+ # IMPORTS
3
+
4
+ import asyncio
5
+ import logging
6
+ from datetime import timedelta
7
+ from random import randint
8
+ from types import TracebackType
9
+ from typing import Optional, Self
10
+
11
+ # 'BdbQuit' import is removed as it's no longer used
12
+ from camoufox.async_api import AsyncCamoufox as Camoufox
13
+ from playwright.async_api import (
14
+ Browser,
15
+ BrowserContext,
16
+ Error as PlaywrightError,
17
+ Page,
18
+ TimeoutError as PlaywrightTimeoutError,
19
+ )
20
+ from tenacity import (
21
+ before_sleep_log,
22
+ retry,
23
+ retry_if_exception_type,
24
+ stop_after_delay,
25
+ wait_exponential,
26
+ )
27
+
28
+ from datamarket.interfaces.proxy import ProxyInterface
29
+
30
+
31
+ ########################################################################################################################
32
+ # SETUP LOGGER
33
+
34
+ logger = logging.getLogger(__name__)
35
+
36
+ ########################################################################################################################
37
+ # ASYNC HELPER FUNCTIONS
38
+
39
+
40
async def human_type(page: Page, text: str, delay: int = 100):
    """
    Asynchronously types `text` one character at a time with a randomized per-key delay.

    Args:
        page (Page): Page whose keyboard receives the keystrokes.
        text (str): Text to type; an empty string types nothing.
        delay (int): Nominal per-key delay passed to Playwright (milliseconds); every keystroke
            draws a random integer between 50% and 150% of this value.
    """
    for character in text:
        # A fresh random delay per keystroke keeps the typing rhythm from being uniform.
        keystroke_delay = randint(int(0.5 * delay), int(1.5 * delay))
        await page.keyboard.type(character, delay=keystroke_delay)
43
+
44
+
45
async def human_press_key(
    page: Page,
    key: str,
    count: int = 1,
    delay: int = 100,
    add_sleep: bool = True,
    *,
    sleep: Optional[bool] = None,
) -> None:
    """
    Asynchronously presses `key` `count` times with a randomized hold delay.

    Args:
        page (Page): Page whose keyboard receives the key presses.
        key (str): Key name understood by Playwright's `keyboard.press`.
        count (int): Number of presses; zero or less presses nothing.
        delay (int): Nominal delay in milliseconds. Each press uses a random value between 50% and
            150% of it; each pause lasts a random 150%-250% of it.
        add_sleep (bool): When true, pause after every press (including the last one).
        sleep (Optional[bool]): Backward-compatible alias for `add_sleep`, which was called `sleep`
            before the parameter was renamed. When given, it takes precedence over `add_sleep`.
    """
    # Keep callers written against the previous keyword (`sleep=...`) working.
    if sleep is not None:
        add_sleep = sleep

    for _ in range(count):
        await page.keyboard.press(key, delay=randint(int(delay * 0.5), int(delay * 1.5)))
        if add_sleep:
            # randint yields milliseconds; asyncio.sleep expects seconds.
            await asyncio.sleep(randint(int(delay * 1.5), int(delay * 2.5)) / 1000)
51
+
52
+
53
+ ########################################################################################################################
54
+ # ASYNC CRAWLER CLASS
55
+
56
+
57
class PlaywrightCrawler:
    """
    A proxy-enabled asynchronous Playwright crawler built on Camoufox, with retrying navigation.

    Use it as an async context manager (`async with PlaywrightCrawler(proxy) as crawler:`) or call
    `goto` directly, which starts the browser on first use.
    """

    def __init__(self, proxy_interface: ProxyInterface):
        """
        Initializes the async crawler with a proxy interface.

        Args:
            proxy_interface (ProxyInterface): Object whose blocking `get_proxies` call returns
                `(host, port, user, password)`; it is executed in a worker thread.
        """
        self.proxy_interface = proxy_interface
        self.pw: Optional[Camoufox] = None
        self.browser: Optional[Browser] = None
        self.context: Optional[BrowserContext] = None
        self.page: Optional[Page] = None
        # Set when a navigation attempt fails so that the next retry starts from a fresh browser/proxy.
        self._restart_pending = False

    async def __aenter__(self) -> Self:
        """Initializes the browser context when entering the `async with` statement."""
        await self.init_context()
        return self

    async def __aexit__(
        self,
        exc_type: Optional[type[BaseException]],
        exc_val: Optional[BaseException],
        exc_tb: Optional[TracebackType],
    ) -> None:
        """Closes the browser (if one is open) when leaving the `async with` statement."""
        await self._close(exc_type, exc_val, exc_tb)

    async def _close(
        self,
        exc_type: Optional[type[BaseException]] = None,
        exc_val: Optional[BaseException] = None,
        exc_tb: Optional[TracebackType] = None,
    ) -> None:
        """Exits the current Camoufox manager at most once and clears every browser handle."""
        pw, self.pw = self.pw, None
        # Drop the handles first so no later code path can reuse or re-close stale objects,
        # even if exiting the manager raises.
        self.browser = None
        self.context = None
        self.page = None
        if pw:
            await pw.__aexit__(exc_type, exc_val, exc_tb)

    @retry(
        wait=wait_exponential(exp_base=2, multiplier=3, max=90),
        stop=stop_after_delay(timedelta(minutes=10)),
        before_sleep=before_sleep_log(logger, logging.INFO),
        reraise=True,
    )
    async def init_context(self) -> Self:
        """
        Initializes a new async browser instance, context and page with a fresh proxy.

        Failed attempts are retried with exponential backoff for up to ten minutes; the last
        exception is re-raised once the time budget is exhausted. The crawler's attributes are
        only updated after the whole start-up sequence succeeded.

        Returns:
            Self: This crawler, ready for navigation.
        """
        # Track the manager locally so that a failure only tears down what this attempt created.
        pw: Optional[Camoufox] = None
        try:
            # get_proxies is blocking I/O, so run it off the event loop.
            host, port, user, pwd = await asyncio.to_thread(self.proxy_interface.get_proxies, raw=True, use_auth=True)
            proxy_url = f"http://{host}:{port}"
            proxy_cfg = {"server": proxy_url}

            if user and pwd:
                proxy_cfg.update({"username": user, "password": pwd})

            logger.info("Starting browser with proxy: %s", proxy_url)
            pw = Camoufox(headless=True, geoip=True, humanize=True, proxy=proxy_cfg)
            browser = await pw.__aenter__()
            context = await browser.new_context()
            page = await context.new_page()
        except Exception as e:
            logger.error("Failed to initialize browser context: %s", e)
            if pw:
                await pw.__aexit__(type(e), e, e.__traceback__)
            raise

        self.pw, self.browser, self.context, self.page = pw, browser, context, page
        return self

    async def restart_context(self) -> None:
        """Closes the current browser instance and initializes a new one."""
        logger.info("Restarting browser context...")
        await self._close()
        await self.init_context()
        self._restart_pending = False

    @retry(
        retry=retry_if_exception_type((PlaywrightTimeoutError, PlaywrightError)),
        wait=wait_exponential(exp_base=2, multiplier=3, max=90),
        stop=stop_after_delay(timedelta(minutes=10)),
        before_sleep=before_sleep_log(logger, logging.INFO),
        reraise=True,
    )
    async def _goto_with_retry(self, url: str) -> Page:
        """
        Asynchronously navigates to a URL with retries for common Playwright errors.

        After a failed attempt the browser context is restarted (new proxy) before the next
        attempt, mirroring the synchronous crawler. A missing or closed page also triggers a
        restart.

        Args:
            url (str): Address to navigate to.

        Returns:
            Page: The page after navigation reached `domcontentloaded`.
        """
        page_usable = self.page is not None and not self.page.is_closed()
        if not page_usable:
            logger.warning("Page is not available or closed. Restarting context.")
        if self._restart_pending or not page_usable:
            await self.restart_context()

        assert self.page is not None  # guaranteed by the restart logic above; narrows the type
        try:
            await self.page.goto(url, timeout=30000, wait_until="domcontentloaded")
        except (PlaywrightTimeoutError, PlaywrightError):
            # Flag the failure so the retry (if any) begins with a fresh browser and proxy.
            self._restart_pending = True
            raise
        return self.page

    async def goto(self, url: str) -> Page:
        """
        Ensures the browser is initialized and navigates to the given URL.

        Public wrapper for the internal retry-enabled navigation method.

        Args:
            url (str): Address to navigate to.

        Returns:
            Page: The page after a successful navigation.
        """
        if not self.page:
            logger.info("Browser context not found, initializing now...")
            await self.init_context()
        # Only failures that happen during this call should force a restart.
        self._restart_pending = False
        return await self._goto_with_retry(url)
@@ -0,0 +1,149 @@
1
+ ########################################################################################################################
2
+ # IMPORTS
3
+
4
+ import logging
5
+ import time
6
+ from datetime import timedelta
7
+ from random import randint
8
+ from types import TracebackType
9
+ from typing import Optional, Self
10
+
11
+ from camoufox import Camoufox
12
+ from playwright.sync_api import (
13
+ Browser,
14
+ BrowserContext,
15
+ Error as PlaywrightError,
16
+ Page,
17
+ TimeoutError as PlaywrightTimeoutError,
18
+ )
19
+ from tenacity import (
20
+ before_sleep_log,
21
+ retry,
22
+ retry_if_exception_type,
23
+ stop_after_delay,
24
+ wait_exponential,
25
+ )
26
+ from datamarket.interfaces.proxy import ProxyInterface
27
+
28
+ ########################################################################################################################
29
+ # SETUP LOGGER
30
+
31
+ logger = logging.getLogger(__name__)
32
+
33
+
34
+ ########################################################################################################################
35
+ # HELPER FUNCTIONS
36
def human_type(page: Page, text: str, delay: int = 100):
    """
    Types `text` one character at a time with a randomized per-key delay.

    Args:
        page (Page): Page whose keyboard receives the keystrokes.
        text (str): Text to type; an empty string types nothing.
        delay (int): Nominal per-key delay passed to Playwright (milliseconds); every keystroke
            draws a random integer between 50% and 150% of this value.
    """
    for character in text:
        # A fresh random delay per keystroke keeps the typing rhythm from being uniform.
        keystroke_delay = randint(int(0.5 * delay), int(1.5 * delay))
        page.keyboard.type(character, delay=keystroke_delay)
39
+
40
+
41
def human_press_key(
    page: Page,
    key: str,
    count: int = 1,
    delay: int = 100,
    add_sleep: bool = True,
    *,
    sleep: Optional[bool] = None,
) -> None:
    """
    Presses `key` `count` times with a randomized hold delay.

    Args:
        page (Page): Page whose keyboard receives the key presses.
        key (str): Key name understood by Playwright's `keyboard.press`.
        count (int): Number of presses; zero or less presses nothing.
        delay (int): Nominal delay in milliseconds. Each press uses a random value between 50% and
            150% of it; each pause lasts a random 150%-250% of it.
        add_sleep (bool): When true, pause after every press (including the last one).
        sleep (Optional[bool]): Backward-compatible alias for `add_sleep`, which was called `sleep`
            before the parameter was renamed. When given, it takes precedence over `add_sleep`.
    """
    # Keep callers written against the previous keyword (`sleep=...`) working.
    if sleep is not None:
        add_sleep = sleep

    for _ in range(count):
        page.keyboard.press(key, delay=randint(int(delay * 0.5), int(delay * 1.5)))
        if add_sleep:
            # randint yields milliseconds; time.sleep expects seconds.
            time.sleep(randint(int(delay * 1.5), int(delay * 2.5)) / 1000)
47
+
48
+
49
+ ########################################################################################################################
50
+ # CRAWLER CLASS
51
+
52
+
53
class PlaywrightCrawler:
    """
    A proxy-enabled Playwright crawler built on Camoufox, with retrying navigation.

    Use it as a context manager (`with PlaywrightCrawler(proxy) as crawler:`) or call `goto`
    directly, which starts the browser on first use.
    """

    def __init__(self, proxy_interface: ProxyInterface):
        """
        Initializes the crawler with a proxy interface.

        Args:
            proxy_interface (ProxyInterface): Object whose `get_proxies` call returns
                `(host, port, user, password)`.
        """
        self.proxy_interface = proxy_interface
        self.pw: Optional[Camoufox] = None
        self.browser: Optional[Browser] = None
        self.context: Optional[BrowserContext] = None
        self.page: Optional[Page] = None

    def __enter__(self) -> Self:
        """Initializes the browser context when entering the `with` statement."""
        self.init_context()
        return self

    def __exit__(
        self,
        exc_type: Optional[type[BaseException]],
        exc_val: Optional[BaseException],
        exc_tb: Optional[TracebackType],
    ) -> None:
        """Closes the browser (if one is open) when leaving the `with` statement."""
        self._close(exc_type, exc_val, exc_tb)

    def _close(
        self,
        exc_type: Optional[type[BaseException]] = None,
        exc_val: Optional[BaseException] = None,
        exc_tb: Optional[TracebackType] = None,
    ) -> None:
        """Exits the current Camoufox manager at most once and clears every browser handle."""
        pw, self.pw = self.pw, None
        # Drop the handles first so no later code path can reuse or re-close stale objects,
        # even if exiting the manager raises.
        self.browser = None
        self.context = None
        self.page = None
        if pw:
            pw.__exit__(exc_type, exc_val, exc_tb)

    @retry(
        wait=wait_exponential(exp_base=2, multiplier=3, max=90),
        stop=stop_after_delay(timedelta(minutes=10)),
        before_sleep=before_sleep_log(logger, logging.INFO),
        reraise=True,
    )
    def init_context(self) -> Self:
        """
        Initializes a new browser instance, context and page with a fresh proxy.

        Failed attempts are retried with exponential backoff for up to ten minutes; the last
        exception is re-raised once the time budget is exhausted. The crawler's attributes are
        only updated after the whole start-up sequence succeeded.

        Returns:
            Self: This crawler, ready for navigation.
        """
        # Track the manager locally so that a failure only tears down what this attempt created.
        pw: Optional[Camoufox] = None
        try:
            host, port, user, pwd = self.proxy_interface.get_proxies(raw=True, use_auth=True)
            proxy_url = f"http://{host}:{port}"
            proxy_cfg = {"server": proxy_url}

            if user and pwd:
                proxy_cfg.update({"username": user, "password": pwd})

            logger.info("Starting browser with proxy: %s", proxy_url)
            pw = Camoufox(headless=True, geoip=True, humanize=True, proxy=proxy_cfg)
            browser = pw.__enter__()
            context = browser.new_context()
            page = context.new_page()
        except Exception as e:
            logger.error("Failed to initialize browser context: %s", e)
            if pw:
                pw.__exit__(type(e), e, e.__traceback__)
            raise

        self.pw, self.browser, self.context, self.page = pw, browser, context, page
        return self

    def restart_context(self) -> None:
        """Closes the current browser instance and initializes a new one."""
        logger.info("Restarting browser context...")
        self._close()
        self.init_context()

    @retry(
        retry=retry_if_exception_type((PlaywrightTimeoutError, PlaywrightError)),
        wait=wait_exponential(exp_base=2, multiplier=3, max=90),
        stop=stop_after_delay(timedelta(minutes=10)),
        before_sleep=before_sleep_log(logger, logging.INFO),
        # From the second attempt on, start over with a fresh browser (and proxy) before retrying.
        before=lambda rs: rs.args[0].restart_context() if rs.attempt_number > 1 else None,
        reraise=True,
    )
    def _goto_with_retry(self, url: str) -> Page:
        """
        Navigates to a URL with retries for common Playwright errors.

        The browser context is restarted before every retry, and also whenever the page is
        missing or already closed.

        Args:
            url (str): Address to navigate to.

        Returns:
            Page: The page after navigation reached `domcontentloaded`.
        """
        if not (self.page and not self.page.is_closed()):
            logger.warning("Page is not available or closed. Restarting context.")
            self.restart_context()

        # self.page is guaranteed to be valid here by the logic above; the assert narrows the type.
        assert self.page is not None
        self.page.goto(url, timeout=30000, wait_until="domcontentloaded")
        return self.page

    def goto(self, url: str) -> Page:
        """
        Ensures the browser is initialized and navigates to the given URL.

        Public wrapper for the internal retry-enabled navigation method.

        Args:
            url (str): Address to navigate to.

        Returns:
            Page: The page after a successful navigation.
        """
        if not self.page:
            logger.info("Browser context not found, initializing now...")
            self.init_context()
        return self._goto_with_retry(url)
@@ -1,23 +0,0 @@
1
- ########################################################################################################################
2
- # IMPORTS
3
-
4
- import asyncio
5
- from random import randint
6
-
7
- from playwright.async_api import Page
8
-
9
-
10
- ########################################################################################################################
11
- # FUNCTIONS
12
-
13
-
14
- async def human_type(page: Page, text: str, delay: int = 100):
15
- for char in text:
16
- await page.keyboard.type(char, delay=randint(int(delay * 0.5), int(delay * 1.5))) # noqa: S311
17
-
18
-
19
- async def human_press_key(page: Page, key: str, count: int = 1, delay: int = 100, sleep=True):
20
- for _ in range(count):
21
- await page.keyboard.press(key, delay=randint(int(delay * 0.5), int(delay * 1.5))) # noqa: S311
22
- if sleep:
23
- await asyncio.sleep(randint(int(delay * 1.5), int(delay * 2.5)) / 1000) # noqa: S311
@@ -1,23 +0,0 @@
1
- ########################################################################################################################
2
- # IMPORTS
3
-
4
- import time
5
- from random import randint
6
-
7
- from playwright.sync_api import Page
8
-
9
-
10
- ########################################################################################################################
11
- # FUNCTIONS
12
-
13
-
14
- def human_type(page: Page, text: str, delay: int = 100):
15
- for char in text:
16
- page.keyboard.type(char, delay=randint(int(delay * 0.5), int(delay * 1.5))) # noqa: S311
17
-
18
-
19
- def human_press_key(page: Page, key: str, count: int = 1, delay: int = 100, sleep=True):
20
- for _ in range(count):
21
- page.keyboard.press(key, delay=randint(int(delay * 0.5), int(delay * 1.5))) # noqa: S311
22
- if sleep:
23
- time.sleep(randint(int(delay * 1.5), int(delay * 2.5)) / 1000) # noqa: S311
File without changes
File without changes