datamarket 0.7.98__py3-none-any.whl → 0.7.100__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datamarket might be problematic. Click here for more details.
- datamarket/params/nominatim.py +1 -1
- datamarket/utils/playwright/sync_api.py +37 -12
- datamarket/utils/strings/standardization.py +12 -43
- {datamarket-0.7.98.dist-info → datamarket-0.7.100.dist-info}/METADATA +1 -1
- {datamarket-0.7.98.dist-info → datamarket-0.7.100.dist-info}/RECORD +7 -7
- {datamarket-0.7.98.dist-info → datamarket-0.7.100.dist-info}/LICENSE +0 -0
- {datamarket-0.7.98.dist-info → datamarket-0.7.100.dist-info}/WHEEL +0 -0
datamarket/params/nominatim.py
CHANGED
|
@@ -412,7 +412,7 @@ COUNTRY_PARSING_RULES = {
|
|
|
412
412
|
|
|
413
413
|
"zip_search_pattern": re.compile(r"\b\d{5}\b"),
|
|
414
414
|
|
|
415
|
-
"phone_validate_pattern": re.compile(r"^(\+?34)?[
|
|
415
|
+
"phone_validate_pattern": re.compile(r"^(\+?34)?[67]\d{8}$")
|
|
416
416
|
},
|
|
417
417
|
"pt": {
|
|
418
418
|
"zip_validate_pattern": re.compile(r"^\d{4}[- ]{0,1}\d{3}$|^\d{4}$"),
|
|
@@ -53,12 +53,14 @@ def human_press_key(page: Page, key: str, count: int = 1, delay: int = 100, add_
|
|
|
53
53
|
class PlaywrightCrawler:
|
|
54
54
|
"""A robust, proxy-enabled Playwright crawler with captcha bypass and retry logic."""
|
|
55
55
|
|
|
56
|
-
def __init__(self, proxy_interface: ProxyInterface):
|
|
56
|
+
def __init__(self, proxy_interface: Optional[ProxyInterface] = None):
|
|
57
57
|
"""
|
|
58
|
-
Initializes the crawler
|
|
58
|
+
Initializes the crawler.
|
|
59
59
|
|
|
60
60
|
Args:
|
|
61
|
-
proxy_interface (ProxyInterface):
|
|
61
|
+
proxy_interface (Optional[ProxyInterface], optional): Provider used to fetch
|
|
62
|
+
proxy credentials. Defaults to None. When None, no proxy is configured and
|
|
63
|
+
the browser will run without a proxy.
|
|
62
64
|
"""
|
|
63
65
|
self.proxy_interface = proxy_interface
|
|
64
66
|
self.pw: Optional[Camoufox] = None
|
|
@@ -81,6 +83,25 @@ class PlaywrightCrawler:
|
|
|
81
83
|
if self.pw:
|
|
82
84
|
self.pw.__exit__(exc_type, exc_val, exc_tb)
|
|
83
85
|
|
|
86
|
+
def _build_proxy_config(self) -> Optional[dict]:
|
|
87
|
+
"""Builds the proxy configuration dictionary.
|
|
88
|
+
|
|
89
|
+
Returns:
|
|
90
|
+
Optional[dict]: Proxy configuration if a proxy_interface is provided; otherwise None.
|
|
91
|
+
"""
|
|
92
|
+
if not self.proxy_interface:
|
|
93
|
+
logger.info("Starting browser without proxy.")
|
|
94
|
+
return None
|
|
95
|
+
|
|
96
|
+
host, port, user, pwd = self.proxy_interface.get_proxies(raw=True, use_auth=True)
|
|
97
|
+
proxy_url = f"http://{host}:{port}"
|
|
98
|
+
proxy_cfg: dict = {"server": proxy_url}
|
|
99
|
+
if user and pwd:
|
|
100
|
+
proxy_cfg.update({"username": user, "password": pwd})
|
|
101
|
+
|
|
102
|
+
logger.info(f"Starting browser with proxy: {proxy_url}")
|
|
103
|
+
return proxy_cfg
|
|
104
|
+
|
|
84
105
|
@retry(
|
|
85
106
|
wait=wait_exponential(exp_base=2, multiplier=3, max=90),
|
|
86
107
|
stop=stop_after_delay(timedelta(minutes=10)),
|
|
@@ -88,16 +109,20 @@ class PlaywrightCrawler:
|
|
|
88
109
|
reraise=True,
|
|
89
110
|
)
|
|
90
111
|
def init_context(self) -> Self:
|
|
91
|
-
"""
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
112
|
+
"""
|
|
113
|
+
Initializes a new browser instance and context.
|
|
114
|
+
|
|
115
|
+
Behavior:
|
|
116
|
+
- If a proxy_interface is provided, fetches fresh proxy credentials and starts
|
|
117
|
+
the browser using that proxy.
|
|
118
|
+
- If proxy_interface is None, starts the browser without any proxy.
|
|
96
119
|
|
|
97
|
-
|
|
98
|
-
|
|
120
|
+
Returns:
|
|
121
|
+
Self: The crawler instance with active browser, context, and page.
|
|
122
|
+
"""
|
|
123
|
+
try:
|
|
124
|
+
proxy_cfg: Optional[dict] = self._build_proxy_config()
|
|
99
125
|
|
|
100
|
-
logger.info(f"Starting browser with proxy: {proxy_url}")
|
|
101
126
|
self.pw = Camoufox(headless=True, geoip=True, humanize=True, proxy=proxy_cfg)
|
|
102
127
|
self.browser = self.pw.__enter__()
|
|
103
128
|
self.context = self.browser.new_context()
|
|
@@ -146,4 +171,4 @@ class PlaywrightCrawler:
|
|
|
146
171
|
if not self.page:
|
|
147
172
|
logger.info("Browser context not found, initializing now...")
|
|
148
173
|
self.init_context()
|
|
149
|
-
return self._goto_with_retry(url)
|
|
174
|
+
return self._goto_with_retry(url)
|
|
@@ -8,45 +8,6 @@ from ...params.nominatim import COUNTRY_PARSING_RULES
|
|
|
8
8
|
########################################################################################################################
|
|
9
9
|
# FUNCTIONS
|
|
10
10
|
|
|
11
|
-
|
|
12
|
-
def _standardize_es_phone_number(number: str) -> str | None:
|
|
13
|
-
"""Standardize phone numbers from Spain using regex validation.
|
|
14
|
-
|
|
15
|
-
Args:
|
|
16
|
-
number (str): cleaned, digits-only phone number
|
|
17
|
-
|
|
18
|
-
Returns:
|
|
19
|
-
str | None: standardized 9-digit phone number
|
|
20
|
-
"""
|
|
21
|
-
# Get the validation regex from params
|
|
22
|
-
pattern = COUNTRY_PARSING_RULES["es"]["phone_validate_pattern"]
|
|
23
|
-
|
|
24
|
-
# Validate and extract in one step
|
|
25
|
-
match = pattern.match(number)
|
|
26
|
-
|
|
27
|
-
# Return the captured group (the 9-digit number)
|
|
28
|
-
return match.group(1) if match else None
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
def _standardize_pt_phone_number(number: str) -> str | None:
|
|
32
|
-
"""Standardize phone numbers from Portugal using regex validation.
|
|
33
|
-
|
|
34
|
-
Args:
|
|
35
|
-
number (str): cleaned, digits-only phone number
|
|
36
|
-
|
|
37
|
-
Returns:
|
|
38
|
-
str | None: standardized 9-digit phone number
|
|
39
|
-
"""
|
|
40
|
-
# Get the validation regex from params
|
|
41
|
-
pattern = COUNTRY_PARSING_RULES["pt"]["phone_validate_pattern"]
|
|
42
|
-
|
|
43
|
-
# Validate and extract in one step
|
|
44
|
-
match = pattern.match(number)
|
|
45
|
-
|
|
46
|
-
# Return the captured group (the 9-digit number)
|
|
47
|
-
return match.group(1) if match else None
|
|
48
|
-
|
|
49
|
-
|
|
50
11
|
def parse_phone_number(number: str, country_code: Literal["es", "pt"]) -> str | None:
|
|
51
12
|
"""Clean and standardize phone number from a certain country_code
|
|
52
13
|
|
|
@@ -61,9 +22,17 @@ def parse_phone_number(number: str, country_code: Literal["es", "pt"]) -> str |
|
|
|
61
22
|
str | None: standardized phone number
|
|
62
23
|
"""
|
|
63
24
|
clean_number = re.sub(r"\D", "", number)
|
|
64
|
-
if country_code
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
25
|
+
if country_code in {"es", "pt"}:
|
|
26
|
+
# Get the validation regex from params
|
|
27
|
+
pattern = COUNTRY_PARSING_RULES[country_code]["phone_validate_pattern"]
|
|
28
|
+
|
|
29
|
+
# Validate and extract in one step
|
|
30
|
+
if len(clean_number) >= 9: # Check if the cleaned number has at least 9 digits
|
|
31
|
+
match = pattern.match(clean_number)
|
|
32
|
+
|
|
33
|
+
# Return the captured group (the 9-digit number)
|
|
34
|
+
return match.group(0)[-9:] if match else None
|
|
35
|
+
else:
|
|
36
|
+
return None # Or handle the case where the number is too short
|
|
68
37
|
else:
|
|
69
38
|
raise ValueError(f"Country code ({country_code}) is not currently supported")
|
|
@@ -12,7 +12,7 @@ datamarket/interfaces/peerdb.py,sha256=sO451wEGNb_0DDwchZ6eBVYKltqHM5XKau-WsfspX
|
|
|
12
12
|
datamarket/interfaces/proxy.py,sha256=Uu-dHvpQOLNBZPGHAanLXnKT1789ArcHfOw8exECt34,5398
|
|
13
13
|
datamarket/interfaces/tinybird.py,sha256=AYrcRGNOCoCt7ojilkWa27POROee9sTCwZ61GGHEPeM,2698
|
|
14
14
|
datamarket/params/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
15
|
-
datamarket/params/nominatim.py,sha256=
|
|
15
|
+
datamarket/params/nominatim.py,sha256=RnmYXGoJQCijOsuCavCYcxw98WvOd_vOMK4KaraI0RU,11967
|
|
16
16
|
datamarket/utils/__init__.py,sha256=FHLh-Qp9XpM4LkAocppCf_llW2CWVVghGorkqxqt1wk,34
|
|
17
17
|
datamarket/utils/airflow.py,sha256=al0vc0YUikNu3Oy51VSn52I7pMU40akFBOl_UlHa2E4,795
|
|
18
18
|
datamarket/utils/alchemy.py,sha256=SRq6kgh1aANXVShBPgAuglmNhZssPWwWEY503gKSia8,635
|
|
@@ -20,16 +20,16 @@ datamarket/utils/main.py,sha256=KYHjDOps6_Q3TFV_Jj7MLj-L9Evx05AXELCvp06BARU,5857
|
|
|
20
20
|
datamarket/utils/nominatim.py,sha256=IxexKY2KOlDhiKtzsqQfoVUjJXPxJl7tn3iHUaQKg08,5795
|
|
21
21
|
datamarket/utils/playwright/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
22
22
|
datamarket/utils/playwright/async_api.py,sha256=UbA2D4ScBtYeMfrRjly4RO-s8wXIub9c05J1eoOCpsQ,5782
|
|
23
|
-
datamarket/utils/playwright/sync_api.py,sha256=
|
|
23
|
+
datamarket/utils/playwright/sync_api.py,sha256=eXaZsd7xgWSYJtZv6EAstjSbS2bl9OYlkwMBfqqTbFY,6434
|
|
24
24
|
datamarket/utils/selenium.py,sha256=IMKlbLzXABFhACnWzhHmB0l2hhVzNwHGZwbo14nEewQ,2499
|
|
25
25
|
datamarket/utils/soda.py,sha256=eZTXFbI1P3WoMd1MM-YjoVTpdjTcDSWuvBb7ViBMhSQ,941
|
|
26
26
|
datamarket/utils/strings/__init__.py,sha256=b6TYOT9v7y9ID-lDyZk4E8BH2uIPbsF2ZSLGjCQ1MCQ,43
|
|
27
27
|
datamarket/utils/strings/normalization.py,sha256=UBluU6ABY6aCpnd02F7L7HcivVSisRJ9IUXdj9D1MyE,9050
|
|
28
28
|
datamarket/utils/strings/obfuscation.py,sha256=Jo-x3f2Cb75983smmpcdPqUlBrLCTyrnmH3FPlgUUjM,5246
|
|
29
|
-
datamarket/utils/strings/standardization.py,sha256=
|
|
29
|
+
datamarket/utils/strings/standardization.py,sha256=c8CAG6HI3AfK0hB3A3IGwsbnQebZ6R3PrA5PELHRXM0,1492
|
|
30
30
|
datamarket/utils/typer.py,sha256=FDF3l6gh3UlAFPsHCtesnekvct2rKz0oFn3uKARBQvE,814
|
|
31
31
|
datamarket/utils/types.py,sha256=vxdQZdwdXrfPR4Es52gBgol-tMRIOD6oK9cBo3rB0JQ,74
|
|
32
|
-
datamarket-0.7.
|
|
33
|
-
datamarket-0.7.
|
|
34
|
-
datamarket-0.7.
|
|
35
|
-
datamarket-0.7.
|
|
32
|
+
datamarket-0.7.100.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
|
33
|
+
datamarket-0.7.100.dist-info/METADATA,sha256=ZzGfCV51bIyPYJVdCSfJDdX8YuC9_BjKR1VCoRtd6yI,7382
|
|
34
|
+
datamarket-0.7.100.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
|
|
35
|
+
datamarket-0.7.100.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|