PyPI - datamarket - Versions diffs - 0.7.98__tar.gz → 0.7.100__tar.gz - Mend

datamarket 0.7.98.tar.gz → 0.7.100.tar.gz

This diff shows the changes between two publicly available versions of the package, exactly as they were released to one of the supported public registries. It is provided for informational purposes only.

Potentially problematic release.

This version of datamarket might be problematic. Click here for more details.

Files changed (36) [hide / show]

{datamarket-0.7.98 → datamarket-0.7.100}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: datamarket
-Version: 0.7.98
+Version: 0.7.100
 Summary: Utilities that integrate advanced scraping knowledge into just one library.
 License: GPL-3.0-or-later
 Author: DataMarket

{datamarket-0.7.98 → datamarket-0.7.100}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "datamarket"
-version = "0.7.98"
+version = "0.7.100"
 description = "Utilities that integrate advanced scraping knowledge into just one library."
 authors = ["DataMarket <techsupport@datamarket.es>"]
 license = "GPL-3.0-or-later"

{datamarket-0.7.98 → datamarket-0.7.100}/src/datamarket/params/nominatim.py RENAMED Viewed

@@ -412,7 +412,7 @@ COUNTRY_PARSING_RULES = {
         "zip_search_pattern": re.compile(r"\b\d{5}\b"),
-        "phone_validate_pattern": re.compile(r"^(\+?34)?[6|7]\d{8}$")
+        "phone_validate_pattern": re.compile(r"^(\+?34)?[67]\d{8}$")
     },
     "pt": {
         "zip_validate_pattern": re.compile(r"^\d{4}[- ]{0,1}\d{3}$|^\d{4}$"),

{datamarket-0.7.98 → datamarket-0.7.100}/src/datamarket/utils/playwright/sync_api.py RENAMED Viewed

@@ -53,12 +53,14 @@ def human_press_key(page: Page, key: str, count: int = 1, delay: int = 100, add_
 class PlaywrightCrawler:
     """A robust, proxy-enabled Playwright crawler with captcha bypass and retry logic."""
-    def __init__(self, proxy_interface: ProxyInterface):
+    def __init__(self, proxy_interface: Optional[ProxyInterface] = None):
         """
-        Initializes the crawler with a proxy interface.
+        Initializes the crawler.
         Args:
-            proxy_interface (ProxyInterface): An object to fetch proxy credentials.
+            proxy_interface (Optional[ProxyInterface], optional): Provider used to fetch
+                proxy credentials. Defaults to None. When None, no proxy is configured and
+                the browser will run without a proxy.
         """
         self.proxy_interface = proxy_interface
         self.pw: Optional[Camoufox] = None
@@ -81,6 +83,25 @@ class PlaywrightCrawler:
         if self.pw:
             self.pw.__exit__(exc_type, exc_val, exc_tb)
+    def _build_proxy_config(self) -> Optional[dict]:
+        """Builds the proxy configuration dictionary.
+        Returns:
+            Optional[dict]: Proxy configuration if a proxy_interface is provided; otherwise None.
+        """
+        if not self.proxy_interface:
+            logger.info("Starting browser without proxy.")
+            return None
+        host, port, user, pwd = self.proxy_interface.get_proxies(raw=True, use_auth=True)
+        proxy_url = f"http://{host}:{port}"
+        proxy_cfg: dict = {"server": proxy_url}
+        if user and pwd:
+            proxy_cfg.update({"username": user, "password": pwd})
+        logger.info(f"Starting browser with proxy: {proxy_url}")
+        return proxy_cfg
     @retry(
         wait=wait_exponential(exp_base=2, multiplier=3, max=90),
         stop=stop_after_delay(timedelta(minutes=10)),
@@ -88,16 +109,20 @@ class PlaywrightCrawler:
         reraise=True,
     )
     def init_context(self) -> Self:
-        """Initializes a new browser instance and context with a fresh proxy."""
-        try:
-            host, port, user, pwd = self.proxy_interface.get_proxies(raw=True, use_auth=True)
-            proxy_url = f"http://{host}:{port}"
-            proxy_cfg = {"server": proxy_url}
+        """
+        Initializes a new browser instance and context.
+        Behavior:
+        - If a proxy_interface is provided, fetches fresh proxy credentials and starts
+          the browser using that proxy.
+        - If proxy_interface is None, starts the browser without any proxy.
-            if user and pwd:
-                proxy_cfg.update({"username": user, "password": pwd})
+        Returns:
+            Self: The crawler instance with active browser, context, and page.
+        """
+        try:
+            proxy_cfg: Optional[dict] = self._build_proxy_config()
-            logger.info(f"Starting browser with proxy: {proxy_url}")
             self.pw = Camoufox(headless=True, geoip=True, humanize=True, proxy=proxy_cfg)
             self.browser = self.pw.__enter__()
             self.context = self.browser.new_context()
@@ -146,4 +171,4 @@ class PlaywrightCrawler:
         if not self.page:
             logger.info("Browser context not found, initializing now...")
             self.init_context()
-        return self._goto_with_retry(url)
+        return self._goto_with_retry(url)

datamarket-0.7.100/src/datamarket/utils/strings/standardization.py ADDED Viewed

@@ -0,0 +1,38 @@
+########################################################################################################################
+# IMPORTS
+import re
+from typing import Literal
+from ...params.nominatim import COUNTRY_PARSING_RULES
+########################################################################################################################
+# FUNCTIONS
+def parse_phone_number(number: str, country_code: Literal["es", "pt"]) -> str | None:
+    """Clean and standardize phone number from a certain country_code
+    Args:
+        number (str): phone number
+        country_code (Literal["es", "pt"]): country code of the phone number to parse
+    Raises:
+        ValueError: when parsing is not supported for a certain country
+    Returns:
+        str | None: standardized phone number
+    """
+    clean_number = re.sub(r"\D", "", number)
+    if country_code in {"es", "pt"}:
+        # Get the validation regex from params
+        pattern = COUNTRY_PARSING_RULES[country_code]["phone_validate_pattern"]
+        # Validate and extract in one step
+        if len(clean_number) >= 9: # Check if the cleaned number has at least 9 digits
+            match = pattern.match(clean_number)
+            # Return the captured group (the 9-digit number)
+            return match.group(0)[-9:] if match else None
+        else:
+            return None # Or handle the case where the number is too short
+    else:
+        raise ValueError(f"Country code ({country_code}) is not currently supported")

datamarket-0.7.98/src/datamarket/utils/strings/standardization.py DELETED Viewed

@@ -1,69 +0,0 @@
-########################################################################################################################
-# IMPORTS
-import re
-from typing import Literal
-from ...params.nominatim import COUNTRY_PARSING_RULES
-########################################################################################################################
-# FUNCTIONS
-def _standardize_es_phone_number(number: str) -> str | None:
-    """Standardize phone numbers from Spain using regex validation.
-    Args:
-        number (str): cleaned, digits-only phone number
-    Returns:
-        str | None: standardized 9-digit phone number
-    """
-    # Get the validation regex from params
-    pattern = COUNTRY_PARSING_RULES["es"]["phone_validate_pattern"]
-    # Validate and extract in one step
-    match = pattern.match(number)
-    # Return the captured group (the 9-digit number)
-    return match.group(1) if match else None
-def _standardize_pt_phone_number(number: str) -> str | None:
-    """Standardize phone numbers from Portugal using regex validation.
-    Args:
-        number (str): cleaned, digits-only phone number
-    Returns:
-        str | None: standardized 9-digit phone number
-    """
-    # Get the validation regex from params
-    pattern = COUNTRY_PARSING_RULES["pt"]["phone_validate_pattern"]
-    # Validate and extract in one step
-    match = pattern.match(number)
-    # Return the captured group (the 9-digit number)
-    return match.group(1) if match else None
-def parse_phone_number(number: str, country_code: Literal["es", "pt"]) -> str | None:
-    """Clean and standardize phone number from a certain country_code
-    Args:
-        number (str): phone number
-        country_code (Literal["es", "pt"]): country code of the phone number to parse
-    Raises:
-        ValueError: when parsing is not supported for a certain country
-    Returns:
-        str | None: standardized phone number
-    """
-    clean_number = re.sub(r"\D", "", number)
-    if country_code == "es":
-        return _standardize_es_phone_number(clean_number)
-    elif country_code == "pt":
-        return _standardize_pt_phone_number(clean_number)
-    else:
-        raise ValueError(f"Country code ({country_code}) is not currently supported")