datamarket 0.7.98__py3-none-any.whl → 0.7.100__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datamarket might be problematic. Click here for more details.

@@ -412,7 +412,7 @@ COUNTRY_PARSING_RULES = {
412
412
 
413
413
  "zip_search_pattern": re.compile(r"\b\d{5}\b"),
414
414
 
415
- "phone_validate_pattern": re.compile(r"^(\+?34)?[6|7]\d{8}$")
415
+ "phone_validate_pattern": re.compile(r"^(\+?34)?[67]\d{8}$")
416
416
  },
417
417
  "pt": {
418
418
  "zip_validate_pattern": re.compile(r"^\d{4}[- ]{0,1}\d{3}$|^\d{4}$"),
@@ -53,12 +53,14 @@ def human_press_key(page: Page, key: str, count: int = 1, delay: int = 100, add_
53
53
  class PlaywrightCrawler:
54
54
  """A robust, proxy-enabled Playwright crawler with captcha bypass and retry logic."""
55
55
 
56
- def __init__(self, proxy_interface: ProxyInterface):
56
+ def __init__(self, proxy_interface: Optional[ProxyInterface] = None):
57
57
  """
58
- Initializes the crawler with a proxy interface.
58
+ Initializes the crawler.
59
59
 
60
60
  Args:
61
- proxy_interface (ProxyInterface): An object to fetch proxy credentials.
61
+ proxy_interface (Optional[ProxyInterface], optional): Provider used to fetch
62
+ proxy credentials. Defaults to None. When None, no proxy is configured and
63
+ the browser will run without a proxy.
62
64
  """
63
65
  self.proxy_interface = proxy_interface
64
66
  self.pw: Optional[Camoufox] = None
@@ -81,6 +83,25 @@ class PlaywrightCrawler:
81
83
  if self.pw:
82
84
  self.pw.__exit__(exc_type, exc_val, exc_tb)
83
85
 
86
+ def _build_proxy_config(self) -> Optional[dict]:
87
+ """Builds the proxy configuration dictionary.
88
+
89
+ Returns:
90
+ Optional[dict]: Proxy configuration if a proxy_interface is provided; otherwise None.
91
+ """
92
+ if not self.proxy_interface:
93
+ logger.info("Starting browser without proxy.")
94
+ return None
95
+
96
+ host, port, user, pwd = self.proxy_interface.get_proxies(raw=True, use_auth=True)
97
+ proxy_url = f"http://{host}:{port}"
98
+ proxy_cfg: dict = {"server": proxy_url}
99
+ if user and pwd:
100
+ proxy_cfg.update({"username": user, "password": pwd})
101
+
102
+ logger.info(f"Starting browser with proxy: {proxy_url}")
103
+ return proxy_cfg
104
+
84
105
  @retry(
85
106
  wait=wait_exponential(exp_base=2, multiplier=3, max=90),
86
107
  stop=stop_after_delay(timedelta(minutes=10)),
@@ -88,16 +109,20 @@ class PlaywrightCrawler:
88
109
  reraise=True,
89
110
  )
90
111
  def init_context(self) -> Self:
91
- """Initializes a new browser instance and context with a fresh proxy."""
92
- try:
93
- host, port, user, pwd = self.proxy_interface.get_proxies(raw=True, use_auth=True)
94
- proxy_url = f"http://{host}:{port}"
95
- proxy_cfg = {"server": proxy_url}
112
+ """
113
+ Initializes a new browser instance and context.
114
+
115
+ Behavior:
116
+ - If a proxy_interface is provided, fetches fresh proxy credentials and starts
117
+ the browser using that proxy.
118
+ - If proxy_interface is None, starts the browser without any proxy.
96
119
 
97
- if user and pwd:
98
- proxy_cfg.update({"username": user, "password": pwd})
120
+ Returns:
121
+ Self: The crawler instance with active browser, context, and page.
122
+ """
123
+ try:
124
+ proxy_cfg: Optional[dict] = self._build_proxy_config()
99
125
 
100
- logger.info(f"Starting browser with proxy: {proxy_url}")
101
126
  self.pw = Camoufox(headless=True, geoip=True, humanize=True, proxy=proxy_cfg)
102
127
  self.browser = self.pw.__enter__()
103
128
  self.context = self.browser.new_context()
@@ -146,4 +171,4 @@ class PlaywrightCrawler:
146
171
  if not self.page:
147
172
  logger.info("Browser context not found, initializing now...")
148
173
  self.init_context()
149
- return self._goto_with_retry(url)
174
+ return self._goto_with_retry(url)
@@ -8,45 +8,6 @@ from ...params.nominatim import COUNTRY_PARSING_RULES
8
8
  ########################################################################################################################
9
9
  # FUNCTIONS
10
10
 
11
-
12
- def _standardize_es_phone_number(number: str) -> str | None:
13
- """Standardize phone numbers from Spain using regex validation.
14
-
15
- Args:
16
- number (str): cleaned, digits-only phone number
17
-
18
- Returns:
19
- str | None: standardized 9-digit phone number
20
- """
21
- # Get the validation regex from params
22
- pattern = COUNTRY_PARSING_RULES["es"]["phone_validate_pattern"]
23
-
24
- # Validate and extract in one step
25
- match = pattern.match(number)
26
-
27
- # Return the captured group (the 9-digit number)
28
- return match.group(1) if match else None
29
-
30
-
31
- def _standardize_pt_phone_number(number: str) -> str | None:
32
- """Standardize phone numbers from Portugal using regex validation.
33
-
34
- Args:
35
- number (str): cleaned, digits-only phone number
36
-
37
- Returns:
38
- str | None: standardized 9-digit phone number
39
- """
40
- # Get the validation regex from params
41
- pattern = COUNTRY_PARSING_RULES["pt"]["phone_validate_pattern"]
42
-
43
- # Validate and extract in one step
44
- match = pattern.match(number)
45
-
46
- # Return the captured group (the 9-digit number)
47
- return match.group(1) if match else None
48
-
49
-
50
11
  def parse_phone_number(number: str, country_code: Literal["es", "pt"]) -> str | None:
51
12
  """Clean and standardize phone number from a certain country_code
52
13
 
@@ -61,9 +22,17 @@ def parse_phone_number(number: str, country_code: Literal["es", "pt"]) -> str |
61
22
  str | None: standardized phone number
62
23
  """
63
24
  clean_number = re.sub(r"\D", "", number)
64
- if country_code == "es":
65
- return _standardize_es_phone_number(clean_number)
66
- elif country_code == "pt":
67
- return _standardize_pt_phone_number(clean_number)
25
+ if country_code in {"es", "pt"}:
26
+ # Get the validation regex from params
27
+ pattern = COUNTRY_PARSING_RULES[country_code]["phone_validate_pattern"]
28
+
29
+ # Validate and extract in one step
30
+ if len(clean_number) >= 9: # Check if the cleaned number has at least 9 digits
31
+ match = pattern.match(clean_number)
32
+
33
+ # Return the captured group (the 9-digit number)
34
+ return match.group(0)[-9:] if match else None
35
+ else:
36
+ return None # Or handle the case where the number is too short
68
37
  else:
69
38
  raise ValueError(f"Country code ({country_code}) is not currently supported")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: datamarket
3
- Version: 0.7.98
3
+ Version: 0.7.100
4
4
  Summary: Utilities that integrate advanced scraping knowledge into just one library.
5
5
  License: GPL-3.0-or-later
6
6
  Author: DataMarket
@@ -12,7 +12,7 @@ datamarket/interfaces/peerdb.py,sha256=sO451wEGNb_0DDwchZ6eBVYKltqHM5XKau-WsfspX
12
12
  datamarket/interfaces/proxy.py,sha256=Uu-dHvpQOLNBZPGHAanLXnKT1789ArcHfOw8exECt34,5398
13
13
  datamarket/interfaces/tinybird.py,sha256=AYrcRGNOCoCt7ojilkWa27POROee9sTCwZ61GGHEPeM,2698
14
14
  datamarket/params/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
- datamarket/params/nominatim.py,sha256=Xl0mBls_Yz7y8nU6-boNTMDYsxvJRIFpZyMT2gLOqvs,11968
15
+ datamarket/params/nominatim.py,sha256=RnmYXGoJQCijOsuCavCYcxw98WvOd_vOMK4KaraI0RU,11967
16
16
  datamarket/utils/__init__.py,sha256=FHLh-Qp9XpM4LkAocppCf_llW2CWVVghGorkqxqt1wk,34
17
17
  datamarket/utils/airflow.py,sha256=al0vc0YUikNu3Oy51VSn52I7pMU40akFBOl_UlHa2E4,795
18
18
  datamarket/utils/alchemy.py,sha256=SRq6kgh1aANXVShBPgAuglmNhZssPWwWEY503gKSia8,635
@@ -20,16 +20,16 @@ datamarket/utils/main.py,sha256=KYHjDOps6_Q3TFV_Jj7MLj-L9Evx05AXELCvp06BARU,5857
20
20
  datamarket/utils/nominatim.py,sha256=IxexKY2KOlDhiKtzsqQfoVUjJXPxJl7tn3iHUaQKg08,5795
21
21
  datamarket/utils/playwright/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
22
22
  datamarket/utils/playwright/async_api.py,sha256=UbA2D4ScBtYeMfrRjly4RO-s8wXIub9c05J1eoOCpsQ,5782
23
- datamarket/utils/playwright/sync_api.py,sha256=Tw_-KLB3vipFuEQwcX8iCbj7giCzcwXB-bhl_ncR-2Q,5542
23
+ datamarket/utils/playwright/sync_api.py,sha256=eXaZsd7xgWSYJtZv6EAstjSbS2bl9OYlkwMBfqqTbFY,6434
24
24
  datamarket/utils/selenium.py,sha256=IMKlbLzXABFhACnWzhHmB0l2hhVzNwHGZwbo14nEewQ,2499
25
25
  datamarket/utils/soda.py,sha256=eZTXFbI1P3WoMd1MM-YjoVTpdjTcDSWuvBb7ViBMhSQ,941
26
26
  datamarket/utils/strings/__init__.py,sha256=b6TYOT9v7y9ID-lDyZk4E8BH2uIPbsF2ZSLGjCQ1MCQ,43
27
27
  datamarket/utils/strings/normalization.py,sha256=UBluU6ABY6aCpnd02F7L7HcivVSisRJ9IUXdj9D1MyE,9050
28
28
  datamarket/utils/strings/obfuscation.py,sha256=Jo-x3f2Cb75983smmpcdPqUlBrLCTyrnmH3FPlgUUjM,5246
29
- datamarket/utils/strings/standardization.py,sha256=xl4I6F3brDFdRWKy7jKnOIIEo2YsqcBcPa2p5TJGRC0,2236
29
+ datamarket/utils/strings/standardization.py,sha256=c8CAG6HI3AfK0hB3A3IGwsbnQebZ6R3PrA5PELHRXM0,1492
30
30
  datamarket/utils/typer.py,sha256=FDF3l6gh3UlAFPsHCtesnekvct2rKz0oFn3uKARBQvE,814
31
31
  datamarket/utils/types.py,sha256=vxdQZdwdXrfPR4Es52gBgol-tMRIOD6oK9cBo3rB0JQ,74
32
- datamarket-0.7.98.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
33
- datamarket-0.7.98.dist-info/METADATA,sha256=f8YsBdGrtjmUOroA-qvv1bEpnt1LSfrCxqt8z6kF_tk,7381
34
- datamarket-0.7.98.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
35
- datamarket-0.7.98.dist-info/RECORD,,
32
+ datamarket-0.7.100.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
33
+ datamarket-0.7.100.dist-info/METADATA,sha256=ZzGfCV51bIyPYJVdCSfJDdX8YuC9_BjKR1VCoRtd6yI,7382
34
+ datamarket-0.7.100.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
35
+ datamarket-0.7.100.dist-info/RECORD,,