datamarket 0.6.0__py3-none-any.whl → 0.10.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of datamarket might be problematic.
- datamarket/__init__.py +0 -1
- datamarket/exceptions/__init__.py +1 -0
- datamarket/exceptions/main.py +118 -0
- datamarket/interfaces/alchemy.py +1934 -25
- datamarket/interfaces/aws.py +81 -14
- datamarket/interfaces/azure.py +127 -0
- datamarket/interfaces/drive.py +60 -10
- datamarket/interfaces/ftp.py +37 -14
- datamarket/interfaces/llm.py +1220 -0
- datamarket/interfaces/nominatim.py +314 -42
- datamarket/interfaces/peerdb.py +272 -104
- datamarket/interfaces/proxy.py +354 -50
- datamarket/interfaces/tinybird.py +7 -15
- datamarket/params/nominatim.py +439 -0
- datamarket/utils/__init__.py +1 -1
- datamarket/utils/airflow.py +10 -7
- datamarket/utils/alchemy.py +2 -1
- datamarket/utils/logs.py +88 -0
- datamarket/utils/main.py +138 -10
- datamarket/utils/nominatim.py +201 -0
- datamarket/utils/playwright/__init__.py +0 -0
- datamarket/utils/playwright/async_api.py +274 -0
- datamarket/utils/playwright/sync_api.py +281 -0
- datamarket/utils/requests.py +655 -0
- datamarket/utils/selenium.py +6 -12
- datamarket/utils/strings/__init__.py +1 -0
- datamarket/utils/strings/normalization.py +217 -0
- datamarket/utils/strings/obfuscation.py +153 -0
- datamarket/utils/strings/standardization.py +40 -0
- datamarket/utils/typer.py +2 -1
- datamarket/utils/types.py +1 -0
- datamarket-0.10.3.dist-info/METADATA +172 -0
- datamarket-0.10.3.dist-info/RECORD +38 -0
- {datamarket-0.6.0.dist-info → datamarket-0.10.3.dist-info}/WHEEL +1 -2
- datamarket-0.6.0.dist-info/METADATA +0 -49
- datamarket-0.6.0.dist-info/RECORD +0 -24
- datamarket-0.6.0.dist-info/top_level.txt +0 -1
- {datamarket-0.6.0.dist-info → datamarket-0.10.3.dist-info/licenses}/LICENSE +0 -0
datamarket/utils/main.py
CHANGED
@@ -1,6 +1,7 @@
 ########################################################################################################################
 # IMPORTS
 
+import asyncio
 import configparser
 import logging
 import random
@@ -8,8 +9,13 @@ import re
 import shlex
 import subprocess
 import time
+from datetime import timedelta
+from typing import Sequence, overload
 
 import pendulum
+from babel.numbers import parse_decimal
+
+from ..interfaces.proxy import ProxyInterface
 
 ########################################################################################################################
 # FUNCTIONS
@@ -33,12 +39,66 @@ def set_logger(level):
     log.addHandler(ch)
 
 
-
-
-
+@overload
+def ban_sleep(max_time: float) -> None: ...
+
+
+@overload
+def ban_sleep(min_time: float, max_time: float) -> None: ...
+
+
+def ban_sleep(x: float, y: float | None = None) -> None:
+    """
+    Sleep for a random number of seconds.
+
+    Usage:
+        ban_sleep(5)    -> sleeps ~N(5, 2.5²) seconds, truncated to >= 0
+        ban_sleep(3, 7) -> sleeps uniformly between 3 and 7 seconds
+        ban_sleep(7, 3) -> same as above (order doesn't matter)
+    """
+    if y is None:
+        mean = float(x)
+        std_dev = mean / 2.0
+        sleep_time = random.gauss(mean, std_dev)  # noqa: S311
+        sleep_time = max(0.0, sleep_time)
+    else:
+        x, y = sorted([float(x), float(y)])
+        sleep_time = random.uniform(x, y)  # noqa: S311
+
+    logger.info(f"sleeping for {sleep_time:.2f} seconds...")
     time.sleep(sleep_time)
 
 
+@overload
+async def ban_sleep_async(seconds: float) -> None: ...
+
+
+@overload
+async def ban_sleep_async(min_time: float, max_time: float) -> None: ...
+
+
+async def ban_sleep_async(min_time: float, max_time: float | None = None) -> None:
+    """
+    Asynchronous sleep for a random number of seconds.
+
+    Usage:
+        await ban_sleep_async(5)     # sleeps ~N(5, (5/2)²) seconds, truncated to >= 0
+        await ban_sleep_async(3, 7)  # sleeps uniformly between 3 and 7 seconds
+        await ban_sleep_async(7, 3)  # same as above (order doesn't matter)
+    """
+    if max_time is None:
+        mean = float(min_time)
+        std_dev = mean / 2.0
+        sleep_time = random.gauss(mean, std_dev)  # noqa: S311
+        sleep_time = max(0.0, sleep_time)
+    else:
+        min_time, max_time = sorted([float(min_time), float(max_time)])
+        sleep_time = random.uniform(min_time, max_time)  # noqa: S311
+
+    logger.info(f"sleeping for {sleep_time:.2f} seconds...")
+    await asyncio.sleep(sleep_time)
+
+
 def run_bash_command(command):
     p = subprocess.Popen(shlex.split(command), stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
 
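For reference, a minimal sketch of how the two new sleep helpers are meant to be called; the call sites are invented, but the signatures and semantics are exactly those added in the hunk above:

import asyncio

from datamarket.utils.main import ban_sleep, ban_sleep_async

# One argument: Gaussian sleep centered on 5 s (std dev 2.5 s), clamped to >= 0.
ban_sleep(5)

# Two arguments: uniform sleep between 3 and 7 s; argument order does not matter.
ban_sleep(7, 3)


async def main() -> None:
    # Same semantics as ban_sleep, but non-blocking inside an event loop.
    await ban_sleep_async(3, 7)


asyncio.run(main())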
@@ -57,14 +117,27 @@
 
 def text_to_int(text):
     max_int32 = 2147483647
+    parsed_str = re.sub(r"[^\d]", "", text)
+    if parsed_str:
+        num = int(parsed_str)
+    else:
+        return None
+
+    if -max_int32 < num < max_int32:
+        return num
+
+
+def text_to_float(text: str | None, locale: str = "es_ES") -> float | None:
+    if not text:
+        return None
+    match = re.search(r"\d(?:[\d\s.,]*\d)?", text)
+    if not match:
+        return None
+    number_str = match.group(0).replace(" ", "")
     try:
-
-
-
-        return num
-
-    except ValueError:
-        logger.warning(f"unable to parse {text} as integer")
+        return float(parse_decimal(number_str, locale=locale))
+    except Exception:
+        return None
 
 
 def sleep_out_interval(from_h, to_h, tz="Europe/Madrid", seconds=1800):
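The rewritten parsers change behavior subtly, so a few hedged examples (inputs invented; expected results inferred from the code above, with text_to_float defaulting to the es_ES locale, where "." groups thousands and "," marks decimals):

from datamarket.utils.main import text_to_float, text_to_int

text_to_int("abc123def")     # -> 123: non-digit characters are stripped first
text_to_int("no digits")     # -> None: nothing left after stripping
text_to_int("99999999999")   # -> None: outside the int32 guard, falls through

text_to_float("1.234,56 €")  # -> 1234.56 under the default es_ES locale
text_to_float("19,90")       # -> 19.9
text_to_float(None)          # -> None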
@@ -92,3 +165,58 @@ def parse_field(dict_struct, field_path, format_method=None):
     if field_value is None:
         return None
     return format_method(field_value) if format_method else field_value
+
+
+def get_data(
+    url: str,
+    method: str = "GET",
+    output: str = "json",
+    sleep: tuple = (6, 3),
+    proxy_interface: ProxyInterface = None,
+    use_auth_proxies: bool = False,
+    max_proxy_delay: timedelta = timedelta(minutes=10),
+    ignored_status_codes: Sequence[int] = (),
+    **kwargs,
+):
+    """
+    Fetches data from a given URL using HTTP requests, with support for proxy configuration, retries, and flexible output formats.
+
+    Args:
+        url (str): The target URL to fetch data from.
+        method (str, optional): HTTP method to use (e.g., 'GET', 'POST'). Defaults to 'GET'.
+        output (str, optional): Output format ('json', 'text', 'soup', 'response'). Defaults to 'json'.
+        sleep (tuple, optional): Tuple specifying max and min sleep times (seconds) after request. Defaults to (6, 3).
+        use_auth_proxies (bool, optional): Whether to use authenticated proxies. Defaults to False.
+        max_proxy_delay (timedelta, optional): Maximum delay for proxy retry logic. Defaults to 10 minutes.
+        ignored_status_codes (Sequence[int], optional): Status codes to ignore and return response for. Defaults to ().
+        **kwargs: Additional arguments passed to the requests method (timeout defaults to 30 seconds if not specified).
+
+    Returns:
+        Depends on the 'output' argument:
+            - 'json': Parsed JSON response.
+            - 'text': Response text.
+            - 'soup': BeautifulSoup-parsed HTML.
+            - 'response': Raw requests.Response object.
+
+    Raises:
+        IgnoredHTTPError: If a response status code is in `ignored_status_codes`.
+        NotFoundError: If a 404 or 410 status code is returned and not in `ignored_status_codes`.
+        BadRequestError: If a 400 status code is returned and not in `ignored_status_codes`.
+        EmptyResponseError: If the response has no content.
+        ProxyError: On proxy-related errors.
+        requests.HTTPError: For other HTTP errors if not ignored.
+    """
+
+    from .requests import RequestsClient
+
+    client = RequestsClient(proxy_interface)
+    return client.get_data(
+        url=url,
+        method=method,
+        output=output,
+        sleep=sleep,
+        use_auth_proxies=use_auth_proxies,
+        max_proxy_delay=max_proxy_delay,
+        ignored_status_codes=ignored_status_codes,
+        **kwargs,
+    )
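A minimal usage sketch for the new get_data wrapper; the URLs are placeholders, and constructing a ProxyInterface is omitted because its signature is not part of this diff:

from datamarket.utils.main import get_data

# Plain JSON fetch with the default retry and sleep behavior.
payload = get_data("https://example.com/api/items")

# HTML parsed into BeautifulSoup; per the docstring above, a 404 here raises
# IgnoredHTTPError rather than NotFoundError because it is listed as ignored.
soup = get_data(
    "https://example.com/listing",
    output="soup",
    ignored_status_codes=(404,),
)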
datamarket/utils/nominatim.py
ADDED
@@ -0,0 +1,201 @@
+########################################################################################################################
+# IMPORTS
+
+from typing import Literal, Optional
+
+from rapidfuzz import fuzz, process
+from unidecode import unidecode
+
+from ..params.nominatim import (
+    _NORMALIZED_PROVINCE_CACHE,
+    COUNTRY_PARSING_RULES,
+    POSTCODE_TO_STATES,
+    PROVINCE_TO_POSTCODE,
+    PROVINCES,
+    STANDARD_THRESHOLD,
+    STATES,
+)
+from .strings import normalize
+
+########################################################################################################################
+# FUNCTIONS
+
+
+def standardize_admin_division(
+    name: str,
+    level: Literal["province", "state"] = "province",
+    country_code: str = "es",
+) -> Optional[str]:
+    """
+    Normalize and standardize administrative divisions of a given country using RapidFuzz.
+    Uses normalized dict keys for comparison and returns dict values with the official names.
+    """
+    if not name:
+        return None
+
+    country_code = country_code.lower()
+    mapping = STATES.get(country_code) if level == "state" else PROVINCES.get(country_code)
+
+    if not mapping:  # If country is not standardized, return raw name
+        return name
+
+    normalized_name = normalize(name)  # Essential for rapidfuzz to work well
+    result = process.extractOne(
+        normalized_name,
+        mapping.keys(),  # Compare with the normalized names in the dict
+        scorer=fuzz.WRatio,
+        score_cutoff=STANDARD_THRESHOLD,
+    )
+
+    if not result:
+        return None
+
+    best_key, score, _ = result
+
+    # Return the standardized name corresponding to the normalized name
+    return mapping[best_key]
+
+
+def parse_state(
+    zip_code: str,
+    country_code: str,
+) -> str | None:
+    """Given a zip code and a country code, returns the state in which the zip code is located
+
+    Args:
+        zip_code (str)
+        country_code (str)
+
+    Returns:
+        str | None: state if coincidence found, else None
+    """
+    country_postcodes = POSTCODE_TO_STATES.get(country_code, {})
+    state = country_postcodes.get(zip_code[:2], None)
+    return state
+
+
+def _province_postcode_match(
+    address: str,
+    zip_code: str,
+    country_code: str,
+) -> str | None:
+    """
+    Match and return province with the start of all of its zip codes
+    using a pre-computed cache and rapidfuzz for efficient matching.
+
+    Args:
+        address (str)
+        zip_code (str)
+        country_code (str)
+
+    Returns:
+        str | None:
+    """
+    # Get the pre-computed cache for the country
+    cache = _NORMALIZED_PROVINCE_CACHE.get(country_code)
+    if not cache:
+        return None  # Country not configured
+
+    normalized_address = unidecode(address).lower()
+
+    # Use the cached 'choices' list for the search
+    result = process.extractOne(
+        normalized_address,
+        cache["choices"],  # <-- Uses pre-computed list
+        scorer=fuzz.partial_ratio,
+        score_cutoff=100,
+    )
+
+    if not result:
+        return None  # No exact substring match found
+
+    # We only need the index from the result
+    _, _, index = result
+
+    # Get the original province name from the cached 'keys' list
+    original_province = cache["keys"][index]  # <-- Uses pre-computed list
+
+    # Get the postcode prefix from the original map
+    province_map = PROVINCE_TO_POSTCODE.get(country_code, {})
+    postcode_prefix = province_map[original_province]
+
+    return postcode_prefix + zip_code[1:] if len(zip_code) == 4 else zip_code
+
+
+def _parse_es_zip_code(
+    zip_code: str,
+    address: str,
+    opt_address: str | None,
+) -> str:
+    """parse spain zip code"""
+
+    # Get the validation regex from params
+    validate_regex = COUNTRY_PARSING_RULES["es"]["zip_validate_pattern"]
+
+    if validate_regex.match(zip_code):
+        return zip_code
+    else:
+        # Use search regex from params
+        pattern = COUNTRY_PARSING_RULES["es"]["zip_search_pattern"]
+
+        match = pattern.search(address)
+        if match:
+            return match.group()
+        if opt_address:
+            match = pattern.search(opt_address)
+            if match:
+                return match.group()
+
+        province_match = _province_postcode_match(address, zip_code, country_code="es")
+        return province_match or zip_code
+
+
+def _parse_pt_zip_code(
+    zip_code: str,
+    address: str,
+    opt_address: str | None,
+) -> str:
+    """parse portugal zip code"""
+
+    # Get the validation regex from params
+    validate_regex = COUNTRY_PARSING_RULES["pt"]["zip_validate_pattern"]
+
+    if validate_regex.match(zip_code):
+        return zip_code
+    else:
+        # Use search regex from params
+        pattern = COUNTRY_PARSING_RULES["pt"]["zip_search_pattern"]
+
+        match = pattern.search(address)
+        if match is None and opt_address:
+            match = pattern.search(opt_address)
+
+        return match.group() if match else zip_code
+
+
+def parse_zip_code(
+    address: str,
+    zip_code: str,
+    country_code: str,
+    opt_address: str | None = None,
+) -> str | None:
+    """Parse and standardize zip code
+
+    Args:
+        address (str): written address
+        zip_code (str)
+        country_code (str):
+        opt_address (str | None, optional): optional extra address, usually None. Defaults to None.
+
+    Raises:
+        ValueError: when parsing zip code is not supported for the passed country_code
+
+    Returns:
+        str | None
+    """
+    if country_code == "es":
+        return _parse_es_zip_code(zip_code, address, opt_address)
+    elif country_code == "pt":
+        return _parse_pt_zip_code(zip_code, address, opt_address)
+    else:
+        raise ValueError(f"Country code ({country_code}) is not currently supported")
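A hedged sketch of how the new helpers combine; the inputs are invented, and the exact standardized spellings returned depend on the PROVINCES/STATES tables in datamarket/params/nominatim.py, which this diff does not display:

from datamarket.utils.nominatim import parse_zip_code, standardize_admin_division

# Fuzzy-match a misspelled province name against the normalized keys for Spain.
province = standardize_admin_division("guipuzcoa", level="province", country_code="es")

# Validate or recover a malformed Spanish zip code (4 digits) from the address text.
zip_code = parse_zip_code(
    address="Calle Mayor 1, 28013 Madrid",
    zip_code="8013",
    country_code="es",
)

# Only "es" and "pt" are wired up; anything else raises ValueError.
parse_zip_code("10 Downing St", "SW1A 2AA", country_code="uk")  # raises ValueError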
datamarket/utils/playwright/__init__.py
File without changes
datamarket/utils/playwright/async_api.py
ADDED
@@ -0,0 +1,274 @@
+########################################################################################################################
+# IMPORTS
+
+import asyncio
+import json
+import logging
+from datetime import timedelta
+from random import randint
+from types import TracebackType
+from typing import Optional, Self, Sequence
+
+from bs4 import BeautifulSoup
+from camoufox.async_api import AsyncCamoufox as Camoufox
+from playwright.async_api import Browser, BrowserContext, Page, Response
+from playwright.async_api import (
+    Error as PlaywrightError,
+)
+from playwright.async_api import TimeoutError as PlaywrightTimeoutError
+from requests.exceptions import HTTPError, ProxyError
+from tenacity import (
+    before_sleep_log,
+    retry,
+    retry_if_exception_type,
+    retry_if_not_exception_type,
+    stop_after_attempt,
+    stop_after_delay,
+    wait_exponential,
+)
+
+from datamarket.exceptions import BadRequestError, EmptyResponseError, NotFoundError, RedirectionDetectedError
+from datamarket.exceptions.main import IgnoredHTTPError
+from datamarket.interfaces.proxy import ProxyInterface
+from datamarket.utils.main import ban_sleep_async
+
+########################################################################################################################
+# SETUP LOGGER
+
+logger = logging.getLogger(__name__)
+
+########################################################################################################################
+# ASYNC HELPER FUNCTIONS
+
+
+async def human_type(page: Page, text: str, delay: int = 100):
+    for char in text:
+        await page.keyboard.type(char, delay=randint(int(delay * 0.5), int(delay * 1.5)))  # noqa: S311
+
+
+async def human_press_key(page: Page, key: str, count: int = 1, delay: int = 100, add_sleep: bool = True) -> None:
+    """Asynchronously presses a key with a random delay, optionally sleeping between presses."""
+    for _ in range(count):
+        await page.keyboard.press(key, delay=randint(int(delay * 0.5), int(delay * 1.5)))  # noqa: S311
+        if add_sleep:
+            await asyncio.sleep(randint(int(delay * 1.5), int(delay * 2.5)) / 1000)  # noqa: S311
+
+
+########################################################################################################################
+# ASYNC CRAWLER CLASS
+
+
+class PlaywrightCrawler:
+    """A robust, proxy-enabled asynchronous Playwright crawler with captcha bypass and retry logic."""
+
+    _REDIRECT_STATUS_CODES = set(range(300, 309))
+
+    def __init__(self, proxy_interface: Optional[ProxyInterface] = None):
+        """
+        Initializes the async crawler.
+
+        Args:
+            proxy_interface (Optional[ProxyInterface], optional): Provider used to fetch
+                proxy credentials. Defaults to None. When None, no proxy is configured and
+                the browser will run without a proxy.
+        """
+        self.proxy_interface = proxy_interface
+        self.pw: Optional[Camoufox] = None
+        self.browser: Optional[Browser] = None
+        self.context: Optional[BrowserContext] = None
+        self.page: Optional[Page] = None
+
+    async def __aenter__(self) -> Self:
+        """Initializes the browser context when entering the `async with` statement."""
+        await self.init_context()
+        return self
+
+    async def __aexit__(
+        self,
+        exc_type: Optional[type[BaseException]],
+        exc_val: Optional[BaseException],
+        exc_tb: Optional[TracebackType],
+    ) -> None:
+        """Safely closes the browser context upon exit."""
+        if self.pw:
+            await self.pw.__aexit__(exc_type, exc_val, exc_tb)
+
+    async def _build_proxy_config(self) -> Optional[dict]:
+        """Builds the proxy configuration dictionary.
+
+        Returns:
+            Optional[dict]: Proxy configuration if a proxy_interface is provided; otherwise None.
+        """
+        if not self.proxy_interface:
+            logger.info("Starting browser without proxy.")
+            return None
+
+        host, port, user, pwd = await asyncio.to_thread(self.proxy_interface.get_proxies, raw=True, use_auth=True)
+        proxy_url = f"http://{host}:{port}"
+        proxy_cfg: dict = {"server": proxy_url}
+        if user and pwd:
+            proxy_cfg.update({"username": user, "password": pwd})
+
+        logger.info(f"Starting browser with proxy: {proxy_url}")
+        return proxy_cfg
+
+    @retry(
+        wait=wait_exponential(exp_base=2, multiplier=3, max=90),
+        stop=stop_after_delay(timedelta(minutes=10)),
+        before_sleep=before_sleep_log(logger, logging.INFO),
+        reraise=True,
+    )
+    async def init_context(self) -> Self:
+        """
+        Initializes a new async browser instance and context.
+
+        Behavior:
+            - If a proxy_interface is provided, fetches fresh proxy credentials and starts
+              the browser using that proxy.
+            - If proxy_interface is None, starts the browser without any proxy.
+
+        Returns:
+            Self: The crawler instance with active browser, context, and page.
+        """
+        try:
+            proxy_cfg: Optional[dict] = await self._build_proxy_config()
+
+            self.pw = Camoufox(headless=True, geoip=True, humanize=True, proxy=proxy_cfg)
+            self.browser = await self.pw.__aenter__()
+            self.context = await self.browser.new_context()
+            self.page = await self.context.new_page()
+        except Exception as e:
+            logger.error(f"Failed to initialize browser context: {e}")
+            if self.pw:
+                await self.pw.__aexit__(type(e), e, e.__traceback__)
+            raise
+        return self
+
+    async def restart_context(self) -> None:
+        """Closes the current browser instance and initializes a new one."""
+        logger.info("Restarting browser context...")
+        if self.pw:
+            await self.pw.__aexit__(None, None, None)
+        await self.init_context()
+
+    @retry(
+        retry=retry_if_exception_type((PlaywrightTimeoutError, PlaywrightError)),
+        wait=wait_exponential(exp_base=2, multiplier=3, max=90),
+        stop=stop_after_delay(timedelta(minutes=10)),
+        before_sleep=before_sleep_log(logger, logging.INFO),
+        reraise=True,
+    )
+    async def _goto_with_retry(self, url: str, timeout: int = 30_000) -> Response:
+        """
+        Asynchronously navigates to a URL with retries for common Playwright errors.
+        Restarts the browser context on repeated failures.
+        """
+        if not (self.page and not self.page.is_closed()):
+            logger.warning("Page is not available or closed. Restarting context.")
+            await self.restart_context()
+
+        response = await self.page.goto(url, timeout=timeout, wait_until="domcontentloaded")
+        return response
+
+    async def goto(
+        self, url: str, max_proxy_delay: timedelta = timedelta(minutes=10), timeout: int = 30_000
+    ) -> Response:
+        """
+        Ensures the browser is initialized and navigates to the given URL.
+        Public wrapper for the internal retry-enabled navigation method.
+        """
+        if not self.page:
+            logger.info("Browser context not found, initializing now...")
+            await self.init_context()
+        return await self._goto_with_retry.retry_with(stop=stop_after_delay(max_proxy_delay))(self, url, timeout)
+
+    def _handle_http_error(self, status_code: int, url: str, response, allow_redirects: bool = True) -> None:
+        """
+        Handle HTTP errors with special handling for redirects.
+
+        Args:
+            status_code: HTTP status code
+            url: Request URL
+            response: Response object
+
+        Raises:
+            RedirectionDetectedError: If a redirect status is received
+            NotFoundError: For 404/410 errors
+            BadRequestError: For 400 errors
+            HTTPError: For other non-2xx status codes
+        """
+        # Check for redirect status codes
+        if not allow_redirects and response.request.redirected_from:  # noqa: F841
+            raise RedirectionDetectedError(
+                message=f"HTTP redirect detected from {response.request.redirected_from.url} to {response.request.redirected_from.redirected_to.url}",
+                response=response,
+            )
+
+        # Standard error handlers
+        error_handlers = {
+            404: lambda: NotFoundError(message=f"404 Not Found error for {url}", response=response),
+            410: lambda: NotFoundError(message=f"410 Gone error for {url}", response=response),
+            400: lambda: BadRequestError(message=f"400 Bad Request error for {url}", response=response),
+        }
+
+        if status_code in error_handlers:
+            raise error_handlers[status_code]()
+
+        # Raise for any other non-2xx status
+        if status_code >= 400:
+            raise HTTPError(f"Navigation failed: {status_code} - {url}", response=response)
+
+    @retry(
+        retry=retry_if_not_exception_type(
+            (IgnoredHTTPError, NotFoundError, BadRequestError, RedirectionDetectedError, ProxyError)
+        ),
+        wait=wait_exponential(exp_base=3, multiplier=3, max=60),
+        stop=stop_after_attempt(5),
+        before_sleep=before_sleep_log(logger, logging.WARNING),
+        reraise=False,
+        retry_error_callback=lambda rs: None,
+    )
+    async def get_data(
+        self,
+        url: str,
+        output: str = "json",
+        sleep: tuple = (6, 3),
+        max_proxy_delay: timedelta = timedelta(minutes=10),
+        ignored_status_codes: Sequence[int] = (),
+        timeout: int = 30_000,
+        **kwargs,
+    ):
+        """
+        Asynchronously crawls a given URL using Playwright and attempts to parse its body content.
+        Maintains full retry structure and output versatility.
+        """
+
+        params = kwargs.copy()
+
+        allow_redirects = params.get("allow_redirects", True)
+
+        logger.info(f"Fetching data from {url} ...")
+        r = await self.goto(url, max_proxy_delay, timeout)
+        await ban_sleep_async(*sleep)
+        body_content = await self.page.eval_on_selector("body", "body => body.innerText")
+
+        if r.status in ignored_status_codes:
+            raise IgnoredHTTPError(message=f"Status {r.status} in ignored_status_codes for URL {url}", response=r)
+
+        # Handle HTTP errors with redirect detection
+        self._handle_http_error(r.status, url, r, allow_redirects)
+
+        if not body_content:
+            raise EmptyResponseError(message=f"Empty body received from {url} (status {r.status})", response=r)
+
+        output_format = {
+            "json": lambda: json.loads(body_content),
+            "text": lambda: body_content,
+            "soup": lambda: BeautifulSoup(body_content, "html.parser"),
+            "response": lambda: r,
+        }
+
+        if output in output_format:
+            return output_format[output]()
+
+        raise ValueError(f"Unsupported output format: {output}")
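Finally, a minimal usage sketch for the new async crawler; the URL is a placeholder, and a real run additionally needs the Camoufox browser bundle that the camoufox package downloads (see its docs):

import asyncio

from datamarket.utils.playwright.async_api import PlaywrightCrawler


async def main() -> None:
    # Without a ProxyInterface the browser starts with no proxy configured.
    async with PlaywrightCrawler() as crawler:
        # get_data mirrors the sync helper: "json", "text", "soup" or "response".
        data = await crawler.get_data("https://example.com/api", output="json")
        print(data)


asyncio.run(main())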