datamarket 0.7.89__py3-none-any.whl → 0.7.125__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
datamarket/utils/main.py CHANGED
@@ -9,22 +9,12 @@ import re
  import shlex
  import subprocess
  import time
+ from datetime import timedelta
+ from typing import Sequence, overload
 
- from bs4 import BeautifulSoup
  import pendulum
- import requests
- from requests.exceptions import ProxyError
- from tenacity import (
-     before_sleep_log,
-     retry,
-     retry_if_exception_type,
-     retry_if_not_exception_type,
-     stop_after_attempt,
-     stop_after_delay,
-     wait_exponential,
- )
-
- from ..exceptions import RedirectionDetectedError, NotFoundError
+ from babel.numbers import parse_decimal
+
  from ..interfaces.proxy import ProxyInterface
 
  ########################################################################################################################
@@ -49,15 +39,63 @@ def set_logger(level):
      log.addHandler(ch)
 
 
- def ban_sleep(max_time, min_time=0):
-     sleep_time = int(random.uniform(min_time, max_time))
-     logger.info(f"sleeping for {sleep_time} seconds...")
+ @overload
+ def ban_sleep(max_time: float) -> None: ...
+
+
+ @overload
+ def ban_sleep(min_time: float, max_time: float) -> None: ...
+
+
+ def ban_sleep(x: float, y: float | None = None) -> None:
+     """
+     Sleep for a random number of seconds.
+
+     Usage:
+         ban_sleep(5) -> sleeps ~N(5, 2.5²) seconds, truncated to >= 0
+         ban_sleep(3, 7) -> sleeps uniformly between 3 and 7 seconds
+         ban_sleep(7, 3) -> same as above (order doesn't matter)
+     """
+     if y is None:
+         mean = float(x)
+         std_dev = mean / 2.0
+         sleep_time = random.gauss(mean, std_dev)  # noqa: S311
+         sleep_time = max(0.0, sleep_time)
+     else:
+         x, y = sorted([float(x), float(y)])
+         sleep_time = random.uniform(x, y)  # noqa: S311
+
+     logger.info(f"sleeping for {sleep_time:.2f} seconds...")
      time.sleep(sleep_time)
 
 
- async def ban_sleep_async(max_time, min_time=0):
-     sleep_time = int(random.uniform(min_time, max_time))  # noqa: S311
-     logger.info(f"sleeping for {sleep_time} seconds...")
+ @overload
+ async def ban_sleep_async(seconds: float) -> None: ...
+
+
+ @overload
+ async def ban_sleep_async(min_time: float, max_time: float) -> None: ...
+
+
+ async def ban_sleep_async(min_time: float, max_time: float | None = None) -> None:
+     """
+     Asynchronous sleep for a random number of seconds.
+
+     Usage:
+         await ban_sleep_async(5)  # sleeps ~N(5, (5/2)²) seconds, truncated to >= 0
+         await ban_sleep_async(3, 7)  # sleeps uniformly between 3 and 7 seconds
+         await ban_sleep_async(7, 3)  # same as above (order doesn't matter)
+     """
+     if max_time is None:
+         mean = float(min_time)
+         std_dev = mean / 2.0
+         sleep_time = random.gauss(mean, std_dev)  # noqa: S311
+         sleep_time = max(0.0, sleep_time)
+     else:
+         min_time, max_time = sorted([float(min_time), float(max_time)])
+         sleep_time = random.uniform(min_time, max_time)  # noqa: S311
+
+     logger.info(f"sleeping for {sleep_time:.2f} seconds...")
      await asyncio.sleep(sleep_time)
 
 
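
Note: the rewritten ban_sleep / ban_sleep_async change what a single argument means (a Gaussian sleep around the given mean instead of a uniform sleep from 0 to max_time), while the two-argument form stays uniform. A minimal usage sketch, assuming the package is installed:

    from datamarket.utils.main import ban_sleep

    ban_sleep(5)     # Gaussian sleep: mean 5 s, std dev 2.5 s, clamped to >= 0
    ban_sleep(3, 7)  # uniform sleep between 3 and 7 seconds
    ban_sleep(7, 3)  # arguments are sorted first, so equivalent to the call above
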
@@ -89,6 +127,19 @@ def text_to_int(text):
      return num
 
 
+ def text_to_float(text: str | None, locale: str = "es_ES") -> float | None:
+     if not text:
+         return None
+     match = re.search(r"\d(?:[\d\s.,]*\d)?", text)
+     if not match:
+         return None
+     number_str = match.group(0).replace(" ", "")
+     try:
+         return float(parse_decimal(number_str, locale=locale))
+     except Exception:
+         return None
+
+
  def sleep_out_interval(from_h, to_h, tz="Europe/Madrid", seconds=1800):
      while pendulum.now(tz=tz).hour >= to_h or pendulum.now(tz=tz).hour < from_h:
          logger.warning("time to sleep and not scrape anything...")
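
Note: text_to_float extracts the first numeric run from the text and parses it with Babel's parse_decimal, so grouping and decimal separators follow the given locale. A short sketch with illustrative inputs, assuming the package is installed:

    from datamarket.utils.main import text_to_float

    text_to_float("1.234,56 €")                 # default es_ES locale: "." groups, "," decimal -> 1234.56
    text_to_float("$1,234.56", locale="en_US")  # en_US locale: "," groups, "." decimal -> 1234.56
    text_to_float("price on request")           # no digits -> None
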
@@ -116,13 +167,6 @@ def parse_field(dict_struct, field_path, format_method=None):
      return format_method(field_value) if format_method else field_value
 
 
- @retry(
-     retry=retry_if_not_exception_type((NotFoundError, RedirectionDetectedError, ProxyError)),
-     wait=wait_exponential(exp_base=3, multiplier=3, max=60),
-     stop=stop_after_attempt(5),
-     before_sleep=before_sleep_log(logger, logging.WARNING),
-     reraise=True,
- )
  def get_data(
      url: str,
      method: str = "GET",
@@ -130,45 +174,49 @@ def get_data(
      sleep: tuple = (6, 3),
      proxy_interface: ProxyInterface = None,
      use_auth_proxies: bool = False,
-     max_proxy_delay: int = 1800,
+     max_proxy_delay: timedelta = timedelta(minutes=10),
+     ignored_status_codes: Sequence[int] = (),
      **kwargs,
  ):
-     retry_type = retry_if_exception_type(ProxyError)
-     wait = wait_exponential(exp_base=3, multiplier=3, max=60)
-     stop = stop_after_delay(max_proxy_delay)
-     before_sleep = before_sleep_log(logger, logging.WARNING)
-
-     @retry(retry=retry_type, wait=wait, stop=stop, before_sleep=before_sleep, reraise=True)
-     def _fetch_with_proxy_retry(url, method, proxy_interface, use_auth, **params):
-         logger.info(f"Fetching data from {url} ...")
-         proxy_cfg = None
-         if proxy_interface:
-             host, port, user, pwd = proxy_interface.get_proxies(raw=True, use_auth=use_auth)
-             if host and port:
-                 proxy_url = f"http://{host}:{port}"
-                 proxy_auth_url = f"http://{user}:{pwd}@{host}:{port}"
-                 proxy_cfg = {"http": proxy_url, "https": proxy_url}
-                 if user and pwd:
-                     proxy_cfg = {"http": proxy_auth_url, "https": proxy_auth_url}
-                 logger.info(f"Using proxy: {proxy_url}")
-         response = getattr(requests, method.lower())(url, proxies=proxy_cfg, **params)
-         return response
-
-     params = {"timeout": 30} | kwargs
-     r = _fetch_with_proxy_retry(url, method, proxy_interface, use_auth_proxies, **params)
-
-     ban_sleep(*sleep)
-
-     if r.status_code == 404:
-         raise NotFoundError(f"404 Not Found error for {url}")
-     r.raise_for_status()
-     r.encoding = "utf-8"
-
-     if output == "json":
-         return r.json()
-     elif output == "text":
-         return r.text
-     elif output == "soup":
-         return BeautifulSoup(r.content, "html.parser")
-     elif output == "response":
-         return r
+     """
+     Fetches data from a given URL using HTTP requests, with support for proxy configuration, retries, and flexible output formats.
+
+     Args:
+         url (str): The target URL to fetch data from.
+         method (str, optional): HTTP method to use (e.g., 'GET', 'POST'). Defaults to 'GET'.
+         output (str, optional): Output format ('json', 'text', 'soup', 'response'). Defaults to 'json'.
+         sleep (tuple, optional): Tuple specifying max and min sleep times (seconds) after request. Defaults to (6, 3).
+         use_auth_proxies (bool, optional): Whether to use authenticated proxies. Defaults to False.
+         max_proxy_delay (timedelta, optional): Maximum delay for proxy retry logic. Defaults to 10 minutes.
+         ignored_status_codes (Sequence[int], optional): Status codes to ignore and return response for. Defaults to ().
+         **kwargs: Additional arguments passed to the requests method (timeout defaults to 30 seconds if not specified).
+
+     Returns:
+         Depends on the 'output' argument:
+         - 'json': Parsed JSON response.
+         - 'text': Response text.
+         - 'soup': BeautifulSoup-parsed HTML.
+         - 'response': Raw requests.Response object.
+
+     Raises:
+         IgnoredHTTPError: If a response status code is in `ignored_status_codes`.
+         NotFoundError: If a 404 or 410 status code is returned and not in `ignored_status_codes`.
+         BadRequestError: If a 400 status code is returned and not in `ignored_status_codes`.
+         EmptyResponseError: If the response has no content.
+         ProxyError: On proxy-related errors.
+         requests.HTTPError: For other HTTP errors if not ignored.
+     """
+
+     from .requests import RequestsClient
+
+     client = RequestsClient(proxy_interface)
+     return client.get_data(
+         url=url,
+         method=method,
+         output=output,
+         sleep=sleep,
+         use_auth_proxies=use_auth_proxies,
+         max_proxy_delay=max_proxy_delay,
+         ignored_status_codes=ignored_status_codes,
+         **kwargs,
+     )
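
Note: get_data keeps its public entry point but now delegates to RequestsClient, imported lazily from .requests. A hedged sketch of a call using the new parameters; the URL is illustrative:

    from datetime import timedelta

    from datamarket.utils.main import get_data

    payload = get_data(
        "https://example.com/api/items",
        output="json",
        max_proxy_delay=timedelta(minutes=5),
        ignored_status_codes=(403,),
        timeout=15,  # forwarded to requests via **kwargs
    )
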
@@ -0,0 +1,201 @@
+ ########################################################################################################################
+ # IMPORTS
+
+ from typing import Literal, Optional
+
+ from rapidfuzz import fuzz, process
+ from unidecode import unidecode
+
+ from ..params.nominatim import (
+     _NORMALIZED_PROVINCE_CACHE,
+     COUNTRY_PARSING_RULES,
+     POSTCODE_TO_STATES,
+     PROVINCE_TO_POSTCODE,
+     PROVINCES,
+     STANDARD_THRESHOLD,
+     STATES,
+ )
+ from .strings import normalize
+
+ ########################################################################################################################
+ # FUNCTIONS
+
+
+ def standardize_admin_division(
+     name: str,
+     level: Literal["province", "state"] = "province",
+     country_code: str = "es",
+ ) -> Optional[str]:
+     """
+     Normalize and standardize administrative divisions of a given country using RapidFuzz.
+     Uses normalized dict keys for comparison and returns dict values with the official names.
+     """
+     if not name:
+         return None
+
+     country_code = country_code.lower()
+     mapping = STATES.get(country_code) if level == "state" else PROVINCES.get(country_code)
+
+     if not mapping:  # If country is not standardized, return raw name
+         return name
+
+     normalized_name = normalize(name)  # Essential for rapidfuzz to work well
+     result = process.extractOne(
+         normalized_name,
+         mapping.keys(),  # Compare with the normalized names in the dict
+         scorer=fuzz.WRatio,
+         score_cutoff=STANDARD_THRESHOLD,
+     )
+
+     if not result:
+         return None
+
+     best_key, score, _ = result
+
+     # Return the standardized name corresponding to the normalized name
+     return mapping[best_key]
+
+
+ def parse_state(
+     zip_code: str,
+     country_code: str,
+ ) -> str | None:
+     """Given a zip code and a country code, returns the state in which the zip code is located
+
+     Args:
+         zip_code (str)
+         country_code (str)
+
+     Returns:
+         str | None: state if coincidence found, else None
+     """
+     country_postcodes = POSTCODE_TO_STATES.get(country_code, {})
+     state = country_postcodes.get(zip_code[:2], None)
+     return state
+
+
+ def _province_postcode_match(
+     address: str,
+     zip_code: str,
+     country_code: str,
+ ) -> str | None:
+     """
+     Match and return province with the start of all of its zip codes
+     using a pre-computed cache and rapidfuzz for efficient matching.
+
+     Args:
+         address (str)
+         zip_code (str)
+         country_code (str)
+
+     Returns:
+         str | None:
+     """
+     # Get the pre-computed cache for the country
+     cache = _NORMALIZED_PROVINCE_CACHE.get(country_code)
+     if not cache:
+         return None  # Country not configured
+
+     normalized_address = unidecode(address).lower()
+
+     # Use the cached 'choices' list for the search
+     result = process.extractOne(
+         normalized_address,
+         cache["choices"],  # <-- Uses pre-computed list
+         scorer=fuzz.partial_ratio,
+         score_cutoff=100,
+     )
+
+     if not result:
+         return None  # No exact substring match found
+
+     # We only need the index from the result
+     _, _, index = result
+
+     # Get the original province name from the cached 'keys' list
+     original_province = cache["keys"][index]  # <-- Uses pre-computed list
+
+     # Get the postcode prefix from the original map
+     province_map = PROVINCE_TO_POSTCODE.get(country_code, {})
+     postcode_prefix = province_map[original_province]
+
+     return postcode_prefix + zip_code[1:] if len(zip_code) == 4 else zip_code
+
+
+ def _parse_es_zip_code(
+     zip_code: str,
+     address: str,
+     opt_address: str | None,
+ ) -> str:
+     """parse spain zip code"""
+
+     # Get the validation regex from params
+     validate_regex = COUNTRY_PARSING_RULES["es"]["zip_validate_pattern"]
+
+     if validate_regex.match(zip_code):
+         return zip_code
+     else:
+         # Use search regex from params
+         pattern = COUNTRY_PARSING_RULES["es"]["zip_search_pattern"]
+
+         match = pattern.search(address)
+         if match:
+             return match.group()
+         if opt_address:
+             match = pattern.search(opt_address)
+             if match:
+                 return match.group()
+
+         province_match = _province_postcode_match(address, zip_code, country_code="es")
+         return province_match or zip_code
+
+
+ def _parse_pt_zip_code(
+     zip_code: str,
+     address: str,
+     opt_address: str | None,
+ ) -> str:
+     """parse portugal zip code"""
+
+     # Get the validation regex from params
+     validate_regex = COUNTRY_PARSING_RULES["pt"]["zip_validate_pattern"]
+
+     if validate_regex.match(zip_code):
+         return zip_code
+     else:
+         # Use search regex from params
+         pattern = COUNTRY_PARSING_RULES["pt"]["zip_search_pattern"]
+
+         match = pattern.search(address)
+         if match is None and opt_address:
+             match = pattern.search(opt_address)
+
+         return match.group() if match else zip_code
+
+
+ def parse_zip_code(
+     address: str,
+     zip_code: str,
+     country_code: str,
+     opt_address: str | None = None,
+ ) -> str | None:
+     """Parse and standardize zip code
+
+     Args:
+         address (str): written address
+         zip_code (str)
+         country_code (str):
+         opt_address (str | None, optional): optional extra address, usually None. Defaults to None.
+
+     Raises:
+         ValueError: when parsing zip code is not supported for the passed country_code
+
+     Returns:
+         str | None
+     """
+     if country_code == "es":
+         return _parse_es_zip_code(zip_code, address, opt_address)
+     elif country_code == "pt":
+         return _parse_pt_zip_code(zip_code, address, opt_address)
+     else:
+         raise ValueError(f"Country code ({country_code}) is not currently supported")
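
Note: the geography helpers above rely on rapidfuzz and the lookup tables in ..params.nominatim. The new file's path is not shown in this diff, so the import below is an assumption; only the call signatures are taken from the code above:

    from datamarket.utils.nominatim import parse_zip_code, standardize_admin_division  # path assumed

    standardize_admin_division("Alacant", level="province", country_code="es")  # fuzzy-matched against official province names
    parse_zip_code("Calle Mayor 1, Madrid", "8001", country_code="es")          # if validation fails, falls back to address search, then province match
    parse_zip_code("Rua Augusta 100, Lisboa", "1100-053", country_code="pt")    # returned unchanged if it passes the PT validation pattern
    parse_zip_code("Some address", "75001", country_code="fr")                  # unsupported country -> raises ValueError
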
@@ -2,31 +2,35 @@
  # IMPORTS
 
  import asyncio
+ import json
  import logging
  from datetime import timedelta
  from random import randint
  from types import TracebackType
- from typing import Optional, Self
+ from typing import Optional, Self, Sequence
 
- # 'BdbQuit' import is removed as it's no longer used
+ from bs4 import BeautifulSoup
  from camoufox.async_api import AsyncCamoufox as Camoufox
+ from playwright.async_api import Browser, BrowserContext, Page, Response
  from playwright.async_api import (
-     Browser,
-     BrowserContext,
      Error as PlaywrightError,
-     Page,
-     TimeoutError as PlaywrightTimeoutError,
  )
+ from playwright.async_api import TimeoutError as PlaywrightTimeoutError
+ from requests.exceptions import HTTPError, ProxyError
  from tenacity import (
      before_sleep_log,
      retry,
      retry_if_exception_type,
+     retry_if_not_exception_type,
+     stop_after_attempt,
      stop_after_delay,
      wait_exponential,
  )
 
+ from datamarket.exceptions import BadRequestError, EmptyResponseError, NotFoundError, RedirectionDetectedError
+ from datamarket.exceptions.main import IgnoredHTTPError
  from datamarket.interfaces.proxy import ProxyInterface
-
+ from datamarket.utils.main import ban_sleep_async
 
  ########################################################################################################################
  # SETUP LOGGER
@@ -39,15 +43,15 @@ logger = logging.getLogger(__name__)
 
  async def human_type(page: Page, text: str, delay: int = 100):
      for char in text:
-         await page.keyboard.type(char, delay=randint(int(delay * 0.5), int(delay * 1.5)))
+         await page.keyboard.type(char, delay=randint(int(delay * 0.5), int(delay * 1.5)))  # noqa: S311
 
 
  async def human_press_key(page: Page, key: str, count: int = 1, delay: int = 100, add_sleep: bool = True) -> None:
      """Asynchronously presses a key with a random delay, optionally sleeping between presses."""
      for _ in range(count):
-         await page.keyboard.press(key, delay=randint(int(delay * 0.5), int(delay * 1.5)))
+         await page.keyboard.press(key, delay=randint(int(delay * 0.5), int(delay * 1.5)))  # noqa: S311
      if add_sleep:
-         await asyncio.sleep(randint(int(delay * 1.5), int(delay * 2.5)) / 1000)
+         await asyncio.sleep(randint(int(delay * 1.5), int(delay * 2.5)) / 1000)  # noqa: S311
 
 
  ########################################################################################################################
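
Note: human_type and human_press_key only add random jitter around Playwright's keyboard API; the # noqa: S311 markers flag that non-cryptographic randomness is intentional here. A sketch of how they are meant to be driven, assuming an already-open Page and a caller in the same module (the file's path is not shown in this diff):

    from playwright.async_api import Page

    async def submit_query(page: Page, query: str) -> None:
        # type with per-character jitter, then confirm with a jittered Enter press
        await human_type(page, query, delay=120)
        await human_press_key(page, "Enter")
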
@@ -57,12 +61,16 @@ async def human_press_key(page: Page, key: str, count: int = 1, delay: int = 100
  class PlaywrightCrawler:
      """An robust, proxy-enabled asynchronous Playwright crawler with captcha bypass and retry logic."""
 
-     def __init__(self, proxy_interface: ProxyInterface):
+     _REDIRECT_STATUS_CODES = set(range(300, 309))
+
+     def __init__(self, proxy_interface: Optional[ProxyInterface] = None):
          """
-         Initializes the async crawler with a proxy interface.
+         Initializes the async crawler.
 
          Args:
-             proxy_interface (ProxyInterface): An async-compatible object to fetch proxy credentials.
+             proxy_interface (Optional[ProxyInterface], optional): Provider used to fetch
+                 proxy credentials. Defaults to None. When None, no proxy is configured and
+                 the browser will run without a proxy.
          """
          self.proxy_interface = proxy_interface
          self.pw: Optional[Camoufox] = None
@@ -85,6 +93,25 @@ class PlaywrightCrawler:
          if self.pw:
              await self.pw.__aexit__(exc_type, exc_val, exc_tb)
 
+     async def _build_proxy_config(self) -> Optional[dict]:
+         """Builds the proxy configuration dictionary.
+
+         Returns:
+             Optional[dict]: Proxy configuration if a proxy_interface is provided; otherwise None.
+         """
+         if not self.proxy_interface:
+             logger.info("Starting browser without proxy.")
+             return None
+
+         host, port, user, pwd = await asyncio.to_thread(self.proxy_interface.get_proxies, raw=True, use_auth=True)
+         proxy_url = f"http://{host}:{port}"
+         proxy_cfg: dict = {"server": proxy_url}
+         if user and pwd:
+             proxy_cfg.update({"username": user, "password": pwd})
+
+         logger.info(f"Starting browser with proxy: {proxy_url}")
+         return proxy_cfg
+
      @retry(
          wait=wait_exponential(exp_base=2, multiplier=3, max=90),
          stop=stop_after_delay(timedelta(minutes=10)),
@@ -92,17 +119,20 @@ class PlaywrightCrawler:
          reraise=True,
      )
      async def init_context(self) -> Self:
-         """Initializes a new async browser instance and context with a fresh proxy."""
-         try:
-             # Correctly wrap the blocking I/O call
-             host, port, user, pwd = await asyncio.to_thread(self.proxy_interface.get_proxies, raw=True, use_auth=True)
-             proxy_url = f"http://{host}:{port}"
-             proxy_cfg = {"server": proxy_url}
+         """
+         Initializes a new async browser instance and context.
 
-             if user and pwd:
-                 proxy_cfg.update({"username": user, "password": pwd})
+         Behavior:
+             - If a proxy_interface is provided, fetches fresh proxy credentials and starts
+               the browser using that proxy.
+             - If proxy_interface is None, starts the browser without any proxy.
+
+         Returns:
+             Self: The crawler instance with active browser, context, and page.
+         """
+         try:
+             proxy_cfg: Optional[dict] = await self._build_proxy_config()
 
-             logger.info(f"Starting browser with proxy: {proxy_url}")
              self.pw = Camoufox(headless=True, geoip=True, humanize=True, proxy=proxy_cfg)
              self.browser = await self.pw.__aenter__()
              self.context = await self.browser.new_context()
@@ -128,7 +158,7 @@ class PlaywrightCrawler:
          before_sleep=before_sleep_log(logger, logging.INFO),
          reraise=True,
      )
-     async def _goto_with_retry(self, url: str) -> Page:
+     async def _goto_with_retry(self, url: str, timeout: int = 30_000) -> Response:
          """
          Asynchronously navigates to a URL with retries for common Playwright errors.
          Restarts the browser context on repeated failures.
@@ -137,11 +167,12 @@ class PlaywrightCrawler:
              logger.warning("Page is not available or closed. Restarting context.")
              await self.restart_context()
 
-         assert self.page is not None
-         await self.page.goto(url, timeout=30000, wait_until="domcontentloaded")
-         return self.page
+         response = await self.page.goto(url, timeout=timeout, wait_until="domcontentloaded")
+         return response
 
-     async def goto(self, url: str) -> Page:
+     async def goto(
+         self, url: str, max_proxy_delay: timedelta = timedelta(minutes=10), timeout: int = 30_000
+     ) -> Response:
          """
          Ensures the browser is initialized and navigates to the given URL.
          Public wrapper for the internal retry-enabled navigation method.
@@ -149,4 +180,95 @@ class PlaywrightCrawler:
          if not self.page:
              logger.info("Browser context not found, initializing now...")
              await self.init_context()
-         return await self._goto_with_retry(url)
+         return await self._goto_with_retry.retry_with(stop=stop_after_delay(max_proxy_delay))(self, url, timeout)
+
+     def _handle_http_error(self, status_code: int, url: str, response, allow_redirects: bool = True) -> None:
+         """
+         Handle HTTP errors with special handling for redirects.
+
+         Args:
+             status_code: HTTP status code
+             url: Request URL
+             response: Response object
+
+         Raises:
+             RedirectionDetectedError: If a redirect status is received
+             NotFoundError: For 404/410 errors
+             BadRequestError: For 400 errors
+             HTTPError: For other non-2xx status codes
+         """
+         # Check for redirect status codes
+         if not allow_redirects and response.request.redirected_from:  # noqa: F841
+             raise RedirectionDetectedError(
+                 message=f"HTTP redirect detected from {response.request.redirected_from.url} to {response.request.redirected_from.redirected_to.url}",
+                 response=response,
+             )
+
+         # Standard error handlers
+         error_handlers = {
+             404: lambda: NotFoundError(message=f"404 Not Found error for {url}", response=response),
+             410: lambda: NotFoundError(message=f"410 Gone error for {url}", response=response),
+             400: lambda: BadRequestError(message=f"400 Bad Request error for {url}", response=response),
+         }
+
+         if status_code in error_handlers:
+             raise error_handlers[status_code]()
+
+         # Raise for any other non-2xx status
+         if status_code >= 400:
+             raise HTTPError(f"Navigation failed: {status_code} - {url}", response=response)
+
+     @retry(
+         retry=retry_if_not_exception_type(
+             (IgnoredHTTPError, NotFoundError, BadRequestError, RedirectionDetectedError, ProxyError)
+         ),
+         wait=wait_exponential(exp_base=3, multiplier=3, max=60),
+         stop=stop_after_attempt(5),
+         before_sleep=before_sleep_log(logger, logging.WARNING),
+         reraise=False,
+         retry_error_callback=lambda rs: None,
+     )
+     async def get_data(
+         self,
+         url: str,
+         output: str = "json",
+         sleep: tuple = (6, 3),
+         max_proxy_delay: timedelta = timedelta(minutes=10),
+         ignored_status_codes: Sequence[int] = (),
+         timeout: int = 30_000,
+         **kwargs,
+     ):
+         """
+         Asynchronously crawls a given URL using Playwright and attempts to parse its body content.
+         Maintains full retry structure and output versatility.
+         """
+
+         params = kwargs.copy()
+
+         allow_redirects = params.get("allow_redirects", True)
+
+         logger.info(f"Fetching data from {url} ...")
+         r = await self.goto(url, max_proxy_delay, timeout)
+         await ban_sleep_async(*sleep)
+         body_content = await self.page.eval_on_selector("body", "body => body.innerText")
+
+         if r.status in ignored_status_codes:
+             raise IgnoredHTTPError(message=f"Status {r.status} in ignored_status_codes for URL {url}", response=r)
+
+         # Handle HTTP errors with redirect detection
+         self._handle_http_error(r.status, url, r, allow_redirects)
+
+         if not body_content:
+             raise EmptyResponseError(message=f"Empty body received from {url} (status {r.status})", response=r)
+
+         output_format = {
+             "json": lambda: json.loads(body_content),
+             "text": lambda: body_content,
+             "soup": lambda: BeautifulSoup(body_content, "html.parser"),
+             "response": lambda: r,
+         }
+
+         if output in output_format:
+             return output_format[output]()
+
+         raise ValueError(f"Unsupported output format: {output}")
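
Note: PlaywrightCrawler.get_data mirrors the requests-based get_data (same output formats and sleep tuple) but navigates with Camoufox/Playwright and parses the page body text. Because the retry decorator sets reraise=False with retry_error_callback=lambda rs: None, exhausted retries resolve to None instead of raising. An end-to-end sketch; the class's module path is not shown in this diff, so the import is an assumption:

    import asyncio
    from datetime import timedelta

    from datamarket.utils.playwright import PlaywrightCrawler  # import path assumed

    async def main() -> None:
        crawler = PlaywrightCrawler()  # no proxy_interface: the browser starts without a proxy
        soup = await crawler.get_data(
            "https://example.com",  # illustrative URL
            output="soup",
            sleep=(2, 1),
            max_proxy_delay=timedelta(minutes=2),
            timeout=20_000,
        )
        if soup is not None:  # None signals that retries were exhausted
            print(soup.get_text()[:200])

    asyncio.run(main())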