datamarket 0.7.89__py3-none-any.whl → 0.7.125__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,29 +1,38 @@
1
1
  ########################################################################################################################
2
2
  # IMPORTS
3
3
 
4
+ import json
4
5
  import logging
5
6
  import time
6
7
  from datetime import timedelta
7
8
  from random import randint
8
9
  from types import TracebackType
9
- from typing import Optional, Self
10
+ from typing import Optional, Self, Sequence
10
11
 
12
+ from bs4 import BeautifulSoup
11
13
  from camoufox import Camoufox
14
+ from playwright.sync_api import Browser, BrowserContext, Page, Response
12
15
  from playwright.sync_api import (
13
- Browser,
14
- BrowserContext,
15
16
  Error as PlaywrightError,
16
- Page,
17
+ )
18
+ from playwright.sync_api import (
17
19
  TimeoutError as PlaywrightTimeoutError,
18
20
  )
21
+ from requests.exceptions import HTTPError, ProxyError
19
22
  from tenacity import (
20
23
  before_sleep_log,
21
24
  retry,
22
25
  retry_if_exception_type,
26
+ retry_if_not_exception_type,
27
+ stop_after_attempt,
23
28
  stop_after_delay,
24
29
  wait_exponential,
25
30
  )
31
+
32
+ from datamarket.exceptions import BadRequestError, EmptyResponseError, NotFoundError, RedirectionDetectedError
33
+ from datamarket.exceptions.main import IgnoredHTTPError
26
34
  from datamarket.interfaces.proxy import ProxyInterface
35
+ from datamarket.utils.main import ban_sleep
27
36
 
28
37
  ########################################################################################################################
29
38
  # SETUP LOGGER
@@ -33,32 +42,38 @@ logger = logging.getLogger(__name__)
33
42
 
34
43
  ########################################################################################################################
35
44
  # HELPER FUNCTIONS
45
+
46
+
36
47
def human_type(page: Page, text: str, delay: int = 100):
    """Type ``text`` one character at a time, each with a randomized key delay.

    Args:
        page: Page whose keyboard receives the keystrokes.
        text: Characters to type, in order.
        delay: Base value for the per-character delay; the value actually passed to
            the keyboard is drawn uniformly from 50% to 150% of it (bounds truncated
            to integers).
    """
    shortest = int(delay * 0.5)
    longest = int(delay * 1.5)
    for symbol in text:
        # A fresh random delay per character avoids a perfectly regular typing rhythm.
        jitter = randint(shortest, longest)  # noqa: S311
        page.keyboard.type(symbol, delay=jitter)
39
50
 
40
51
 
41
52
def human_press_key(page: Page, key: str, count: int = 1, delay: int = 100, add_sleep: bool = True) -> None:
    """Press ``key`` ``count`` times with randomized delays.

    Args:
        page: Page whose keyboard receives the key presses.
        key: Name of the key to press.
        count: Number of presses to perform.
        delay: Base value for the jitter. The press delay is drawn from 50%-150% of
            it; the pause between presses is drawn from 150%-250% of it and divided
            by 1000 before being passed to ``time.sleep``.
        add_sleep: When True, sleep after every press (including the last one).
    """
    press_bounds = (int(delay * 0.5), int(delay * 1.5))
    pause_bounds = (int(delay * 1.5), int(delay * 2.5))

    for _ in range(count):
        press_delay = randint(*press_bounds)  # noqa: S311
        page.keyboard.press(key, delay=press_delay)
        if not add_sleep:
            continue
        pause_ms = randint(*pause_bounds)  # noqa: S311
        time.sleep(pause_ms / 1000)
47
58
 
48
59
 
49
60
  ########################################################################################################################
50
- # CRAWLER CLASS
61
+ # CLASSES
51
62
 
52
63
 
53
64
  class PlaywrightCrawler:
54
65
  """A robust, proxy-enabled Playwright crawler with captcha bypass and retry logic."""
55
66
 
56
- def __init__(self, proxy_interface: ProxyInterface):
67
+ _REDIRECT_STATUS_CODES = set(range(300, 309))
68
+
69
+ def __init__(self, proxy_interface: Optional[ProxyInterface] = None):
57
70
  """
58
- Initializes the crawler with a proxy interface.
71
+ Initializes the crawler.
59
72
 
60
73
  Args:
61
- proxy_interface (ProxyInterface): An object to fetch proxy credentials.
74
+ proxy_interface (Optional[ProxyInterface], optional): Provider used to fetch
75
+ proxy credentials. Defaults to None. When None, no proxy is configured and
76
+ the browser will run without a proxy.
62
77
  """
63
78
  self.proxy_interface = proxy_interface
64
79
  self.pw: Optional[Camoufox] = None
@@ -79,7 +94,34 @@ class PlaywrightCrawler:
79
94
  ) -> None:
80
95
  """Safely closes the browser context upon exit."""
81
96
  if self.pw:
82
- self.pw.__exit__(exc_type, exc_val, exc_tb)
97
+ try:
98
+ self.pw.__exit__(exc_type, exc_val, exc_tb)
99
+ except Exception as e:
100
+ logger.warning(f"Error closing browser: {e}")
101
+ finally:
102
+ self.pw = None
103
+ self.browser = None
104
+ self.context = None
105
+ self.page = None
106
+
107
+ def _build_proxy_config(self) -> Optional[dict]:
108
+ """Builds the proxy configuration dictionary.
109
+
110
+ Returns:
111
+ Optional[dict]: Proxy configuration if a proxy_interface is provided; otherwise None.
112
+ """
113
+ if not self.proxy_interface:
114
+ logger.info("Starting browser without proxy.")
115
+ return None
116
+
117
+ host, port, user, pwd = self.proxy_interface.get_proxies(raw=True, use_auth=True)
118
+ proxy_url = f"http://{host}:{port}"
119
+ proxy_cfg: dict = {"server": proxy_url}
120
+ if user and pwd:
121
+ proxy_cfg.update({"username": user, "password": pwd})
122
+
123
+ logger.info(f"Starting browser with proxy: {proxy_url}")
124
+ return proxy_cfg
83
125
 
84
126
  @retry(
85
127
  wait=wait_exponential(exp_base=2, multiplier=3, max=90),
@@ -88,16 +130,20 @@ class PlaywrightCrawler:
88
130
  reraise=True,
89
131
  )
90
132
  def init_context(self) -> Self:
91
- """Initializes a new browser instance and context with a fresh proxy."""
92
- try:
93
- host, port, user, pwd = self.proxy_interface.get_proxies(raw=True, use_auth=True)
94
- proxy_url = f"http://{host}:{port}"
95
- proxy_cfg = {"server": proxy_url}
133
+ """
134
+ Initializes a new browser instance and context.
96
135
 
97
- if user and pwd:
98
- proxy_cfg.update({"username": user, "password": pwd})
136
+ Behavior:
137
+ - If a proxy_interface is provided, fetches fresh proxy credentials and starts
138
+ the browser using that proxy.
139
+ - If proxy_interface is None, starts the browser without any proxy.
140
+
141
+ Returns:
142
+ Self: The crawler instance with active browser, context, and page.
143
+ """
144
+ try:
145
+ proxy_cfg: Optional[dict] = self._build_proxy_config()
99
146
 
100
- logger.info(f"Starting browser with proxy: {proxy_url}")
101
147
  self.pw = Camoufox(headless=True, geoip=True, humanize=True, proxy=proxy_cfg)
102
148
  self.browser = self.pw.__enter__()
103
149
  self.context = self.browser.new_context()
@@ -124,7 +170,7 @@ class PlaywrightCrawler:
124
170
  before=lambda rs: rs.args[0].restart_context() if rs.attempt_number > 1 else None,
125
171
  reraise=True,
126
172
  )
127
- def _goto_with_retry(self, url: str) -> Page:
173
+ def _goto_with_retry(self, url: str, timeout: int = 30_000) -> Response:
128
174
  """
129
175
  Navigates to a URL with retries for common Playwright errors.
130
176
  Restarts the browser context on repeated failures.
@@ -133,12 +179,10 @@ class PlaywrightCrawler:
133
179
  logger.warning("Page is not available or closed. Restarting context.")
134
180
  self.restart_context()
135
181
 
136
- # self.page is guaranteed to be valid here by the logic above
137
- assert self.page is not None
138
- self.page.goto(url, timeout=30000, wait_until="domcontentloaded")
139
- return self.page
182
+ response = self.page.goto(url, timeout=timeout, wait_until="domcontentloaded")
183
+ return response
140
184
 
141
- def goto(self, url: str) -> Page:
185
+ def goto(self, url: str, max_proxy_delay: timedelta = timedelta(minutes=10), timeout: int = 30_000) -> Response:
142
186
  """
143
187
  Ensures the browser is initialized and navigates to the given URL.
144
188
  Public wrapper for the internal retry-enabled navigation method.
@@ -146,4 +190,92 @@ class PlaywrightCrawler:
146
190
  if not self.page:
147
191
  logger.info("Browser context not found, initializing now...")
148
192
  self.init_context()
149
- return self._goto_with_retry(url)
193
+ return self._goto_with_retry.retry_with(stop=stop_after_delay(max_proxy_delay))(self, url, timeout)
194
+
195
+ def _handle_http_error(self, status_code: int, url: str, response, allow_redirects: bool = True) -> None:
196
+ """
197
+ Handle HTTP errors with special handling for redirects.
198
+
199
+ Args:
200
+ status_code: HTTP status code
201
+ url: Request URL
202
+ response: Response object
203
+
204
+ Raises:
205
+ RedirectionDetectedError: If a redirect status is received
206
+ NotFoundError: For 404/410 errors
207
+ BadRequestError: For 400 errors
208
+ HTTPError: For other non-2xx status codes
209
+ """
210
+
211
+ if not allow_redirects and response.request.redirected_from: # noqa: F841
212
+ raise RedirectionDetectedError(
213
+ message=f"HTTP redirect detected from {response.request.redirected_from.url} to {response.request.redirected_from.redirected_to.url}",
214
+ response=response,
215
+ )
216
+
217
+ # Standard error handlers
218
+ error_handlers = {
219
+ 404: lambda: NotFoundError(message=f"404 Not Found error for {url}", response=response),
220
+ 410: lambda: NotFoundError(message=f"410 Gone error for {url}", response=response),
221
+ 400: lambda: BadRequestError(message=f"400 Bad Request error for {url}", response=response),
222
+ }
223
+
224
+ if status_code in error_handlers:
225
+ raise error_handlers[status_code]()
226
+
227
+ # Raise for any other non-2xx status
228
+ if status_code >= 400:
229
+ raise HTTPError(f"Navigation failed: {status_code} - {url}", response=response)
230
+
231
+ @retry(
232
+ retry=retry_if_not_exception_type(
233
+ (IgnoredHTTPError, NotFoundError, BadRequestError, RedirectionDetectedError, ProxyError)
234
+ ),
235
+ wait=wait_exponential(exp_base=3, multiplier=3, max=60),
236
+ stop=stop_after_attempt(5),
237
+ before_sleep=before_sleep_log(logger, logging.WARNING),
238
+ reraise=False,
239
+ retry_error_callback=lambda rs: None,
240
+ )
241
+ def get_data(
242
+ self,
243
+ url: str,
244
+ output: str = "json",
245
+ sleep: tuple = (6, 3),
246
+ max_proxy_delay: timedelta = timedelta(minutes=10),
247
+ ignored_status_codes: Sequence[int] = (),
248
+ timeout: int = 30_000,
249
+ **kwargs,
250
+ ):
251
+ """Crawls a given URL using Playwright and attempts to parse its body content"""
252
+
253
+ params = kwargs.copy()
254
+
255
+ allow_redirects = params.get("allow_redirects", True)
256
+
257
+ logger.info(f"Fetching data from {url} ...")
258
+ r = self.goto(url, max_proxy_delay, timeout)
259
+ ban_sleep(*sleep)
260
+ body_content = self.page.eval_on_selector("body", "body => body.innerText")
261
+
262
+ if r.status in ignored_status_codes:
263
+ raise IgnoredHTTPError(message=f"Status {r.status} in ignored_status_codes for URL {url}", response=r)
264
+
265
+ # Handle HTTP errors with redirect detection
266
+ self._handle_http_error(r.status, url, r, allow_redirects)
267
+
268
+ if not body_content:
269
+ raise EmptyResponseError(message=f"Empty body received from {url} (status {r.status})", response=r)
270
+
271
+ output_format = {
272
+ "json": lambda: json.loads(body_content),
273
+ "text": lambda: body_content,
274
+ "soup": lambda: BeautifulSoup(body_content, "html.parser"),
275
+ "response": lambda: r,
276
+ }
277
+
278
+ if output in output_format:
279
+ return output_format[output]()
280
+
281
+ raise ValueError(f"Unsupported output format: {output}")