PyPI - datamarket - Versions diffs - 0.7.97__tar.gz → 0.7.98__tar.gz - Mend

datamarket 0.7.97 (tar.gz) → 0.7.98 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of datamarket might be problematic. Click here for more details.

Files changed (37) hide show

{datamarket-0.7.97 → datamarket-0.7.98}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: datamarket
-Version: 0.7.97
+Version: 0.7.98
 Summary: Utilities that integrate advanced scraping knowledge into just one library.
 License: GPL-3.0-or-later
 Author: DataMarket

{datamarket-0.7.97 → datamarket-0.7.98}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "datamarket"
-version = "0.7.97"
+version = "0.7.98"
 description = "Utilities that integrate advanced scraping knowledge into just one library."
 authors = ["DataMarket <techsupport@datamarket.es>"]
 license = "GPL-3.0-or-later"

{datamarket-0.7.97 → datamarket-0.7.98}/src/datamarket/interfaces/alchemy.py RENAMED Viewed

@@ -215,7 +215,10 @@ class AlchemyInterface:
         label = PG_ERROR_LABELS.get(code, "Integrity error (unspecified)")
         # Log one clean message with trace + the raw DB message separately
-        logger.error(f"{label} trying to {action} {alchemy_obj}\nPostgreSQL message: {ex.orig}")
+        if code == "23505": # A simple info log for unique violations
+            logger.info(f"{label} trying to {action} {alchemy_obj}")
+        else:
+            logger.error(f"{label} trying to {action} {alchemy_obj}\nPostgreSQL message: {ex.orig}")
     def insert_alchemy_obj(self, alchemy_obj: ModelType, silent: bool = False) -> bool:

datamarket-0.7.98/src/datamarket/params/nominatim.py ADDED Viewed

@@ -0,0 +1,427 @@
+from unidecode import unidecode
+import re
+CITY_TO_PROVINCE = {"Madrid": "Madrid"}
+POSTCODES = {
+    "01": "Álava",
+    "02": "Albacete",
+    "03": "Alicante",
+    "04": "Almería",
+    "05": "Ávila",
+    "06": "Badajoz",
+    "07": "Baleares",
+    "08": "Barcelona",
+    "09": "Burgos",
+    "10": "Cáceres",
+    "11": "Cádiz",
+    "12": "Castellón",
+    "13": "Ciudad Real",
+    "14": "Córdoba",
+    "15": "La Coruña",
+    "16": "Cuenca",
+    "17": "Gerona",
+    "18": "Granada",
+    "19": "Guadalajara",
+    "20": "Guipúzcoa",
+    "21": "Huelva",
+    "22": "Huesca",
+    "23": "Jaén",
+    "24": "León",
+    "25": "Lérida",
+    "26": "La Rioja",
+    "27": "Lugo",
+    "28": "Madrid",
+    "29": "Málaga",
+    "30": "Murcia",
+    "31": "Navarra",
+    "32": "Orense",
+    "33": "Asturias",
+    "34": "Palencia",
+    "35": "Las Palmas",
+    "36": "Pontevedra",
+    "37": "Salamanca",
+    "38": "Santa Cruz de Tenerife",
+    "39": "Cantabria",
+    "40": "Segovia",
+    "41": "Sevilla",
+    "42": "Soria",
+    "43": "Tarragona",
+    "44": "Teruel",
+    "45": "Toledo",
+    "46": "Valencia",
+    "47": "Valladolid",
+    "48": "Vizcaya",
+    "49": "Zamora",
+    "50": "Zaragoza",
+    "51": "Ceuta",
+    "52": "Melilla",
+}
+# Mapping of normalized names (for comparison) to standardized names (for storing)
+# for each corresponding country code
+STATES = {
+    "es": {
+        "andalucia": "Andalucía",
+        "aragon": "Aragón",
+        "asturias": "Asturias",
+        "baleares": "Baleares",
+        "canarias": "Canarias",
+        "cantabria": "Cantabria",
+        "castilla la mancha": "Castilla-La Mancha",
+        "castilla y leon": "Castilla y León",
+        "cataluna": "Cataluña",
+        "ceuta": "Ceuta",
+        "comunidad valenciana": "Comunidad Valenciana",
+        "extremadura": "Extremadura",
+        "galicia": "Galicia",
+        "la rioja": "La Rioja",
+        "madrid": "Comunidad de Madrid",
+        "melilla": "Melilla",
+        "murcia": "Murcia",
+        "navarra": "Navarra",
+        "pais vasco": "País Vasco",
+        "euskadi": "País Vasco",  # Alias not caught by rapidfuzz
+    }
+}
+PROVINCES = {
+    "es": {
+        "alava": "Álava",
+        "araba": "Álava",  # Alias not caught by rapidfuzz
+        "albacete": "Albacete",
+        "alicante": "Alicante",
+        "almeria": "Almería",
+        "asturias": "Asturias",
+        "avila": "Ávila",
+        "badajoz": "Badajoz",
+        "barcelona": "Barcelona",
+        "bizkaia": "Vizcaya",
+        "burgos": "Burgos",
+        "caceres": "Cáceres",
+        "cadiz": "Cádiz",
+        "cantabria": "Cantabria",
+        "castellon": "Castellón",
+        "ceuta": "Ceuta",  # Considered province by opensm and/or geonames
+        "ciudad real": "Ciudad Real",
+        "cordoba": "Córdoba",
+        "cuenca": "Cuenca",
+        "gipuzkoa": "Gipuzkoa",
+        "gerona": "Gerona",
+        "granada": "Granada",
+        "guadalajara": "Guadalajara",
+        "huelva": "Huelva",
+        "huesca": "Huesca",
+        "islas baleares": "Islas Baleares",
+        "jaen": "Jaén",
+        "la coruna": "La Coruña",
+        "la rioja": "La Rioja",
+        "las palmas": "Las Palmas",
+        "leon": "León",
+        "lerida": "Lérida",
+        "lugo": "Lugo",
+        "madrid": "Madrid",
+        "malaga": "Málaga",
+        "melilla": "Melilla",  # Considered province by opensm and/or geonames
+        "murcia": "Murcia",
+        "navarra": "Navarra",
+        "orense": "Orense",
+        "palencia": "Palencia",
+        "pontevedra": "Pontevedra",
+        "salamanca": "Salamanca",
+        "santa cruz de tenerife": "Santa Cruz de Tenerife",
+        "segovia": "Segovia",
+        "sevilla": "Sevilla",
+        "soria": "Soria",
+        "tarragona": "Tarragona",
+        "teruel": "Teruel",
+        "toledo": "Toledo",
+        "valencia": "Valencia",
+        "valladolid": "Valladolid",
+        "zamora": "Zamora",
+        "zaragoza": "Zaragoza",
+    }
+}
+PROVINCE_TO_POSTCODE = {
+    "es": {
+        "A Coruña": "15",
+        "Álava": "01",
+        "Araba": "01",
+        "Alacant": "03",
+        "Alicante": "03",
+        "Albacete": "02",
+        "Almería": "04",
+        "Asturias": "33",
+        "Ávila": "05",
+        "Badajoz": "06",
+        "Baleares": "07",
+        "Barcelona": "08",
+        "Bizkaia": "48",
+        "Burgos": "09",
+        "Cáceres": "10",
+        "Cádiz": "11",
+        "Cantabria": "39",
+        "Castelló": "12",
+        "Castellón": "12",
+        "Ceuta": "51",
+        "Ciudad Real": "13",
+        "Córdoba": "14",
+        "Cuenca": "16",
+        "Gerona": "17",
+        "Gipuzkoa": "20",
+        "Girona": "17",
+        "Granada": "18",
+        "Guadalajara": "19",
+        "Guipúzcoa": "20",
+        "Huelva": "21",
+        "Huesca": "22",
+        "Illes Balears": "07",
+        "Jaén": "23",
+        "La Coruña": "15",
+        "La Rioja": "26",
+        "Las Palmas": "35",
+        "León": "24",
+        "Lérida": "25",
+        "Lleida": "25",
+        "Lugo": "27",
+        "Madrid": "28",
+        "Málaga": "29",
+        "Melilla": "52",
+        "Murcia": "30",
+        "Navarra": "31",
+        "Orense": "32",
+        "Ourense": "32",
+        "Palencia": "34",
+        "Pontevedra": "36",
+        "Salamanca": "37",
+        "Santa Cruz de Tenerife": "38",
+        "Segovia": "40",
+        "Sevilla": "41",
+        "Soria": "42",
+        "Tarragona": "43",
+        "Teruel": "44",
+        "Toledo": "45",
+        "València": "46",
+        "Valencia": "46",
+        "Valladolid": "47",
+        "Vizcaya": "48",
+        "Zamora": "49",
+        "Zaragoza": "50",
+    },
+    "pt": {
+        "Aveiro": "3",
+        "Beja": "7",
+        "Braga": "4",
+        "Bragança": "5",
+        "Castelo Branco": "6",
+        "Coimbra": "3",
+        "Évora": "7",
+        "Faro": "8",
+        "Guarda": "6",
+        "Leiria": "2",
+        "Lisboa": "1",
+        "Portalegre": "7",
+        "Porto": "4",
+        "Santarém": "2",
+        "Setúbal": "2",
+        "Viana do Castelo": "4",
+        "Vila Real": "5",
+        "Viseu": "3",
+        "Açores": "9",
+        "Madeira": "9",
+    },
+}
+POSTCODE_TO_STATES = {
+    "es": {
+        # Andalucía
+        "04": "Andalucía",
+        "11": "Andalucía",
+        "14": "Andalucía",
+        "18": "Andalucía",
+        "21": "Andalucía",
+        "23": "Andalucía",
+        "29": "Andalucía",
+        "41": "Andalucía",
+        # Aragón
+        "22": "Aragón",
+        "44": "Aragón",
+        "50": "Aragón",
+        # Asturias
+        "33": "Principado de Asturias",
+        # Baleares
+        "07": "Islas Baleares",
+        # Canarias
+        "35": "Canarias",
+        "38": "Canarias",
+        # Cantabria
+        "39": "Cantabria",
+        # Castilla y León
+        "05": "Castilla y León",
+        "09": "Castilla y León",
+        "24": "Castilla y León",
+        "34": "Castilla y León",
+        "37": "Castilla y León",
+        "40": "Castilla y León",
+        "42": "Castilla y León",
+        "47": "Castilla y León",
+        "49": "Castilla y León",
+        # Castilla-La Mancha
+        "02": "Castilla-La Mancha",
+        "13": "Castilla-La Mancha",
+        "16": "Castilla-La Mancha",
+        "19": "Castilla-La Mancha",
+        "45": "Castilla-La Mancha",
+        # Cataluña
+        "08": "Cataluña",
+        "17": "Cataluña",
+        "25": "Cataluña",
+        "43": "Cataluña",
+        # Comunidad Valenciana
+        "03": "Comunidad Valenciana",
+        "12": "Comunidad Valenciana",
+        "46": "Comunidad Valenciana",
+        # Extremadura
+        "06": "Extremadura",
+        "10": "Extremadura",
+        # Galicia
+        "15": "Galicia",
+        "27": "Galicia",
+        "32": "Galicia",
+        "36": "Galicia",
+        # Madrid
+        "28": "Comunidad de Madrid",
+        # Murcia
+        "30": "Región de Murcia",
+        # Navarra
+        "31": "Comunidad Foral de Navarra",
+        # País Vasco
+        "01": "País Vasco",
+        "20": "País Vasco",
+        "48": "País Vasco",
+        # La Rioja
+        "26": "La Rioja",
+        # Ciudades Autónomas
+        "51": "Ceuta",
+        "52": "Melilla",
+    },
+    "pt": {  # --- NORTE ---
+        "40": "Porto",
+        "41": "Porto",
+        "42": "Porto",
+        "43": "Porto",
+        "44": "Porto",
+        "45": "Aveiro",  # Concelhos do norte de Aveiro, na fronteira com Porto.
+        "47": "Braga",
+        "48": "Braga",  # Guimarães.
+        "49": "Viana do Castelo",
+        "50": "Vila Real",
+        "51": "Vila Real",
+        "52": "Vila Real",
+        "53": "Vila Real / Bragança",  # Zona fronteiriça.
+        "54": "Bragança",
+        # --- CENTRO ---
+        "60": "Castelo Branco",
+        "61": "Castelo Branco",
+        "62": "Castelo Branco",
+        "63": "Guarda",
+        "30": "Coimbra",
+        "31": "Coimbra",
+        "32": "Coimbra",
+        "33": "Coimbra",
+        "34": "Viseu",
+        "35": "Viseu",
+        "37": "Aveiro",
+        "38": "Aveiro",
+        "24": "Leiria",
+        # --- ÁREA METROPOLITANA DE LISBOA e arredores ---
+        "10": "Lisboa",
+        "11": "Lisboa",
+        "12": "Lisboa",
+        "13": "Lisboa",
+        "14": "Lisboa",
+        "15": "Lisboa",
+        "16": "Lisboa",
+        "17": "Lisboa",
+        "18": "Lisboa",
+        "19": "Lisboa",
+        "20": "Santarém",
+        "21": "Santarém",
+        "22": "Santarém",
+        "23": "Santarém",  # Tomar e Torres Novas.
+        "25": "Lisboa",  # Concelhos como Torres Vedras, Mafra, Alenquer.
+        "26": "Lisboa",  # Concelhos como Loures, Amadora, Odivelas.
+        "27": "Lisboa",  # Concelhos como Sintra, Cascais, Oeiras.
+        "28": "Setúbal",
+        "29": "Setúbal",
+        # --- ALENTEJO ---
+        "70": "Évora",
+        "71": "Évora",
+        "72": "Évora",
+        "73": "Portalegre",
+        "74": "Portalegre",
+        "75": "Setúbal",  # Litoral Alentejano (Sines, Grândola), administrativamente de Setúbal.
+        "76": "Beja",
+        "77": "Beja",
+        "78": "Beja",
+        "79": "Beja",
+        # --- ALGARVE ---
+        "80": "Faro",
+        "81": "Faro",
+        "82": "Faro",
+        "83": "Faro",
+        "84": "Faro",
+        "85": "Faro",
+        "86": "Faro",
+        "87": "Faro",
+        "88": "Faro",
+        "89": "Faro",
+        # --- REGIÕES AUTÓNOMAS ---
+        "90": "Madeira",
+        "91": "Madeira",
+        "92": "Madeira",
+        "93": "Madeira",
+        "95": "Açores",  # Ilha de São Miguel (Ponta Delgada).
+        "96": "Açores",  # Ilha de São Miguel (Ribeira Grande) e Santa Maria.
+        "97": "Açores",  # Ilha Terceira (Angra do Heroísmo).
+        "98": "Açores",  # Ilhas de São Jorge, Graciosa, Faial, Pico.
+        "99": "Açores",  # Ilhas de Flores e Corvo.
+    },
+}
+# Import-time lookup structure derived from PROVINCE_TO_POSTCODE.
+# For every country it stores two parallel lists: position i of "choices" is
+# the unidecoded, lower-cased form of position i of "keys", so an index found
+# in "choices" can be translated back to the original province spelling.
+_NORMALIZED_PROVINCE_CACHE = {}
+for country, provinces in PROVINCE_TO_POSTCODE.items():
+    # Get the original keys (e.g., "A Coruña", "Álava")
+    original_keys = list(provinces.keys())
+    # Create the normalized list (e.g., "a coruna", "alava")
+    normalized_choices = [unidecode(p).lower() for p in original_keys]
+    _NORMALIZED_PROVINCE_CACHE[country] = {
+        "choices": normalized_choices, # The list for rapidfuzz to search in
+        "keys": original_keys          # The list to find the name by index
+    }
+# Per-country compiled patterns used to validate/search zip codes and to
+# validate phone numbers.
+# Source: https://github.com/ariankoochak/regex-patterns-of-all-countries
+# In the phone patterns the optional country prefix is a non-capturing group,
+# so capture group 1 is always the national number (without prefix).
+COUNTRY_PARSING_RULES = {
+    "es": {
+        "zip_validate_pattern": re.compile(r"^\d{5}$"),
+        "zip_search_pattern": re.compile(r"\b\d{5}\b"),
+        # "[67]", not "[6|7]": inside a character class "|" is a literal pipe
+        "phone_validate_pattern": re.compile(r"^(?:\+?34)?([67]\d{8})$"),
+    },
+    "pt": {
+        "zip_validate_pattern": re.compile(r"^\d{4}[- ]{0,1}\d{3}$|^\d{4}$"),
+        "zip_search_pattern": re.compile(r"\b\d{4}[- ]?\d{3}\b|\b\d{4}\b"),
+        "phone_validate_pattern": re.compile(r"^(?:\+?351)?(9[1236]\d{7})$"),
+    },
+}
+# Cutoff score for rapidfuzz in the name standardization function
+STANDARD_THRESHOLD = 40

datamarket-0.7.98/src/datamarket/utils/nominatim.py ADDED Viewed

@@ -0,0 +1,204 @@
+########################################################################################################################
+# IMPORTS
+from typing import Literal, Optional
+from rapidfuzz import fuzz, process
+from unidecode import unidecode
+from ..params.nominatim import (
+    POSTCODE_TO_STATES,
+    PROVINCE_TO_POSTCODE,
+    PROVINCES,
+    STANDARD_THRESHOLD,
+    STATES,
+    _NORMALIZED_PROVINCE_CACHE,
+    COUNTRY_PARSING_RULES
+)
+from .strings import normalize
+########################################################################################################################
+# FUNCTIONS
+def standardize_admin_division(
+    name: str,
+    level: Literal["province", "state"] = "province",
+    country_code: str = "es",
+) -> Optional[str]:
+    """
+    Map a free-form administrative division name to its standardized spelling.
+    The name is passed through ``normalize`` and fuzzy-matched (RapidFuzz WRatio)
+    against the normalized keys of STATES or PROVINCES for the given country;
+    the value stored under the best-scoring key is returned.
+    Returns None for an empty name or when no key reaches STANDARD_THRESHOLD,
+    and the raw name unchanged when the country has no table.
+    """
+    if not name:
+        return None
+    # Pick the table for the requested level, then the country's entry in it
+    table = STATES if level == "state" else PROVINCES
+    lookup = table.get(country_code.lower())
+    if not lookup:
+        # Country without standardized names: hand the input back untouched
+        return name
+    best = process.extractOne(
+        normalize(name),  # Compare like with like: keys are already normalized
+        lookup.keys(),
+        scorer=fuzz.WRatio,
+        score_cutoff=STANDARD_THRESHOLD,
+    )
+    if not best:
+        return None
+    # First element of the result is the matched (normalized) key
+    return lookup[best[0]]
+def parse_state(
+    zip_code: str,
+    country_code: str,
+) -> str | None:
+    """Given a zip code and a country code, returns the state in which the zip code is located
+    The first two characters of the zip code are used as the key into the
+    country's entry of POSTCODE_TO_STATES.
+    Args:
+        zip_code (str): zip code; only its first two characters are used
+        country_code (str): country code, matched case-insensitively
+    Returns:
+        str | None: state if coincidence found, else None (also None when
+            either argument is empty or None)
+    """
+    # Guard against missing values instead of failing on the slice below
+    if not zip_code or not country_code:
+        return None
+    # Lower-case the code so "ES" and "es" resolve to the same table
+    country_postcodes = POSTCODE_TO_STATES.get(country_code.lower(), {})
+    return country_postcodes.get(zip_code[:2])
+def _province_postcode_match(
+    address: str,
+    zip_code: str,
+    country_code: str,
+) -> str | None:
+    """
+    Rebuild a 4-character zip code from the province named in the address.
+    The address is unidecoded and lower-cased, then scanned for the first
+    province name (in PROVINCE_TO_POSTCODE order) that it contains. When the
+    zip code has exactly 4 characters, its first character is replaced by that
+    province's postcode prefix; any other zip code is returned unchanged.
+    Args:
+        address (str): written address expected to mention a province
+        zip_code (str): zip code to repair
+        country_code (str): key into the pre-computed province cache
+    Returns:
+        str | None: the (possibly rebuilt) zip code, or None when the country
+            is not configured, the address is empty, or no province name
+            occurs in the address
+    """
+    # Get the pre-computed cache for the country
+    cache = _NORMALIZED_PROVINCE_CACHE.get(country_code)
+    if not cache or not address:
+        return None  # Country not configured, or nothing to search in
+    normalized_address = unidecode(address).lower()
+    # Plain containment check. A partial_ratio score of 100 is also reached
+    # when the *address* is a substring of a province name, which produced
+    # false matches for very short addresses.
+    index = next(
+        (
+            i
+            for i, choice in enumerate(cache["choices"])
+            if choice in normalized_address
+        ),
+        None,
+    )
+    if index is None:
+        return None  # No province name occurs in the address
+    # "choices" and "keys" are parallel lists: same index, original spelling
+    original_province = cache["keys"][index]
+    province_map = PROVINCE_TO_POSTCODE.get(country_code, {})
+    postcode_prefix = province_map[original_province]
+    if len(zip_code) == 4:
+        return postcode_prefix + zip_code[1:]
+    return zip_code
+def _parse_es_zip_code(
+    zip_code: str,
+    address: str,
+    opt_address: str | None,
+) -> str:
+    """Resolve a Spanish zip code, falling back to the address text when needed.
+    Resolution order: the given zip code if it already validates; otherwise the
+    first zip-shaped token found in ``address``, then in ``opt_address``;
+    otherwise whatever ``_province_postcode_match`` returns for the address;
+    and finally the unmodified ``zip_code``.
+    """
+    rules = COUNTRY_PARSING_RULES['es']
+    # Fast path: the value is already a valid zip code
+    if rules['zip_validate_pattern'].match(zip_code):
+        return zip_code
+    finder = rules['zip_search_pattern']
+    found = finder.search(address)
+    if found is None and opt_address:
+        # Only consult the optional address when the main one had no hit
+        found = finder.search(opt_address)
+    if found is not None:
+        return found.group()
+    # Last resort: derive the prefix from a province named in the address
+    rebuilt = _province_postcode_match(address, zip_code, country_code="es")
+    return rebuilt or zip_code
+def _parse_pt_zip_code(
+    zip_code: str,
+    address: str,
+    opt_address: str | None,
+) -> str:
+    """Resolve a Portuguese zip code, falling back to the address text.
+    Returns ``zip_code`` as-is when it validates; otherwise the first
+    zip-shaped token found in ``address`` or, failing that, in ``opt_address``;
+    otherwise the unmodified ``zip_code``.
+    """
+    rules = COUNTRY_PARSING_RULES['pt']
+    if rules['zip_validate_pattern'].match(zip_code):
+        return zip_code
+    finder = rules['zip_search_pattern']
+    # The optional address is searched only when provided and non-empty
+    sources = (address, opt_address) if opt_address else (address,)
+    for text in sources:
+        found = finder.search(text)
+        if found:
+            return found.group()
+    return zip_code
+def parse_zip_code(
+    address: str,
+    zip_code: str,
+    country_code: str,
+    opt_address: str | None = None,
+) -> str | None:
+    """Dispatch zip code parsing to the country-specific parser
+    Args:
+        address (str): written address, used as fallback source for the zip code
+        zip_code (str): zip code as received
+        country_code (str): "es" or "pt" (compared exactly, case-sensitive)
+        opt_address (str | None, optional): extra address text to search in. Defaults to None.
+    Raises:
+        ValueError: when parsing zip code is not supported for the passed country_code
+    Returns:
+        str | None: result of the country-specific parser
+    """
+    match country_code:
+        case "es":
+            return _parse_es_zip_code(zip_code, address, opt_address)
+        case "pt":
+            return _parse_pt_zip_code(zip_code, address, opt_address)
+        case _:
+            raise ValueError(f"Country code ({country_code}) is not currently supported")

{datamarket-0.7.97 → datamarket-0.7.98}/src/datamarket/utils/strings/normalization.py RENAMED Viewed

@@ -1,10 +1,9 @@
 ########################################################################################################################
 # IMPORTS
+import re
 import unicodedata
 from enum import Enum, auto
 from typing import Any, Optional, Set, Union
 import numpy as np
 from inflection import camelize, parameterize, titleize, underscore
 from string_utils import prettify, strip_html
@@ -37,7 +36,9 @@ class NamingConvention(Enum):
 # FUNCTIONS
-def get_unidecoded_text(input_text: str, allowed_chars: Set[str], apply_lowercase: bool = False) -> str:
+def get_unidecoded_text(
+    input_text: str, allowed_chars: Set[str], apply_lowercase: bool = False
+) -> str:
     """
     Processes a string by unidecoding characters, optionally lowercasing them,
     while preserving a specified set of allowed characters.
@@ -64,7 +65,9 @@ def get_unidecoded_text(input_text: str, allowed_chars: Set[str], apply_lowercas
     return "".join(chars_list)
-def transliterate_symbols(s: str, allowed_symbols_set: Optional[Set[str]] = None) -> str:
+def transliterate_symbols(
+    s: str, allowed_symbols_set: Optional[Set[str]] = None
+) -> str:
     """
     Translates Unicode symbols (category S*) in the input string to their lowercase Unicode names,
     with spaces replaced by underscores. Other characters, or characters in allowed_symbols_set, remain unchanged.
@@ -179,7 +182,9 @@ def normalize(
             for c in intermediate_text:
                 cat = unicodedata.category(c)
-                if c in _allowed_symbols_set or c.isalnum():  # Allowed symbols are part of tokens
+                if (
+                    c in _allowed_symbols_set or c.isalnum()
+                ):  # Allowed symbols are part of tokens
                     current_token_chars.append(c)
                 elif mode is NormalizationMode.FULL and cat.startswith("S"):
                     # Transliterate S* category symbols not in allowed_symbols

datamarket-0.7.98/src/datamarket/utils/strings/standardization.py ADDED Viewed

@@ -0,0 +1,69 @@
+########################################################################################################################
+# IMPORTS
+import re
+from typing import Literal
+from ...params.nominatim import COUNTRY_PARSING_RULES
+########################################################################################################################
+# FUNCTIONS
+def _standardize_es_phone_number(number: str) -> str | None:
+    """Standardize phone numbers from Spain using regex validation.
+    Args:
+        number (str): cleaned, digits-only phone number
+    Returns:
+        str | None: the 9-digit national number (country prefix removed) when
+            ``number`` matches the Spanish validation pattern, else None
+    """
+    # Get the validation regex from params
+    pattern = COUNTRY_PARSING_RULES["es"]["phone_validate_pattern"]
+    if not pattern.match(number):
+        return None
+    # The pattern is anchored at both ends and ends with the nine national
+    # digits, so for digits-only input they are the last nine characters.
+    # Slicing does not depend on how the pattern numbers its capture groups.
+    return number[-9:]
+def _standardize_pt_phone_number(number: str) -> str | None:
+    """Standardize phone numbers from Portugal using regex validation.
+    Args:
+        number (str): cleaned, digits-only phone number
+    Returns:
+        str | None: the 9-digit national number (country prefix removed) when
+            ``number`` matches the Portuguese validation pattern, else None
+    """
+    # Get the validation regex from params
+    pattern = COUNTRY_PARSING_RULES["pt"]["phone_validate_pattern"]
+    if not pattern.match(number):
+        return None
+    # The pattern is anchored at both ends and ends with the nine national
+    # digits, so for digits-only input they are the last nine characters.
+    # Slicing does not depend on how the pattern numbers its capture groups.
+    return number[-9:]
+def parse_phone_number(number: str, country_code: Literal["es", "pt"]) -> str | None:
+    """Strip non-digits from a phone number and standardize it for a country
+    Args:
+        number (str): phone number, any formatting
+        country_code (Literal["es", "pt"]): country whose rules are applied
+    Raises:
+        ValueError: when parsing is not supported for a certain country
+    Returns:
+        str | None: result of the country-specific standardizer
+    """
+    # Every non-digit character (spaces, "+", dashes, ...) is dropped first
+    digits_only = re.sub(r"\D", "", number)
+    if country_code not in ("es", "pt"):
+        raise ValueError(f"Country code ({country_code}) is not currently supported")
+    standardizer = (
+        _standardize_es_phone_number
+        if country_code == "es"
+        else _standardize_pt_phone_number
+    )
+    return standardizer(digits_only)

datamarket-0.7.97/src/datamarket/params/nominatim.py DELETED Viewed

@@ -1,144 +0,0 @@
-CITY_TO_PROVINCE = {"Madrid": "Madrid"}
-POSTCODES = {
-    "01": "Álava",
-    "02": "Albacete",
-    "03": "Alicante",
-    "04": "Almería",
-    "05": "Ávila",
-    "06": "Badajoz",
-    "07": "Baleares",
-    "08": "Barcelona",
-    "09": "Burgos",
-    "10": "Cáceres",
-    "11": "Cádiz",
-    "12": "Castellón",
-    "13": "Ciudad Real",
-    "14": "Córdoba",
-    "15": "La Coruña",
-    "16": "Cuenca",
-    "17": "Gerona",
-    "18": "Granada",
-    "19": "Guadalajara",
-    "20": "Guipúzcoa",
-    "21": "Huelva",
-    "22": "Huesca",
-    "23": "Jaén",
-    "24": "León",
-    "25": "Lérida",
-    "26": "La Rioja",
-    "27": "Lugo",
-    "28": "Madrid",
-    "29": "Málaga",
-    "30": "Murcia",
-    "31": "Navarra",
-    "32": "Orense",
-    "33": "Asturias",
-    "34": "Palencia",
-    "35": "Las Palmas",
-    "36": "Pontevedra",
-    "37": "Salamanca",
-    "38": "Santa Cruz de Tenerife",
-    "39": "Cantabria",
-    "40": "Segovia",
-    "41": "Sevilla",
-    "42": "Soria",
-    "43": "Tarragona",
-    "44": "Teruel",
-    "45": "Toledo",
-    "46": "Valencia",
-    "47": "Valladolid",
-    "48": "Vizcaya",
-    "49": "Zamora",
-    "50": "Zaragoza",
-    "51": "Ceuta",
-    "52": "Melilla",
-}
-# Mapping of normalized names (for comparison) to standardized names (for storing)
-# for each corresponding country code
-STATES = {
-    "es": {
-        "andalucia": "Andalucía",
-        "aragon": "Aragón",
-        "asturias": "Asturias",
-        "baleares": "Baleares",
-        "canarias": "Canarias",
-        "cantabria": "Cantabria",
-        "castilla la mancha": "Castilla-La Mancha",
-        "castilla y leon": "Castilla y León",
-        "cataluna": "Cataluña",
-        "ceuta": "Ceuta",
-        "comunidad valenciana": "Comunidad Valenciana",
-        "extremadura": "Extremadura",
-        "galicia": "Galicia",
-        "la rioja": "La Rioja",
-        "madrid": "Comunidad de Madrid",
-        "melilla": "Melilla",
-        "murcia": "Murcia",
-        "navarra": "Navarra",
-        "pais vasco": "País Vasco",
-        "euskadi": "País Vasco",    # Alias not caught by rapidfuzz
-    }
-}
-PROVINCES = {
-    "es": {
-        "alava": "Álava",
-        "araba": "Álava", # Alias not caught by rapidfuzz
-        "albacete": "Albacete",
-        "alicante": "Alicante",
-        "almeria": "Almería",
-        "asturias": "Asturias",
-        "avila": "Ávila",
-        "badajoz": "Badajoz",
-        "barcelona": "Barcelona",
-        "bizkaia": "Vizcaya",
-        "burgos": "Burgos",
-        "caceres": "Cáceres",
-        "cadiz": "Cádiz",
-        "cantabria": "Cantabria",
-        "castellon": "Castellón",
-        "ceuta": "Ceuta", # Considered province by opensm and/or geonames
-        "ciudad real": "Ciudad Real",
-        "cordoba": "Córdoba",
-        "cuenca": "Cuenca",
-        "gipuzkoa": "Gipuzkoa",
-        "gerona": "Gerona",
-        "granada": "Granada",
-        "guadalajara": "Guadalajara",
-        "huelva": "Huelva",
-        "huesca": "Huesca",
-        "islas baleares": "Islas Baleares",
-        "jaen": "Jaén",
-        "la coruna": "La Coruña",
-        "la rioja": "La Rioja",
-        "las palmas": "Las Palmas",
-        "leon": "León",
-        "lerida": "Lérida",
-        "lugo": "Lugo",
-        "madrid": "Madrid",
-        "malaga": "Málaga",
-        "melilla": "Melilla", # Considered province by opensm and/or geonames
-        "murcia": "Murcia",
-        "navarra": "Navarra",
-        "orense": "Orense",
-        "palencia": "Palencia",
-        "pontevedra": "Pontevedra",
-        "salamanca": "Salamanca",
-        "santa cruz de tenerife": "Santa Cruz de Tenerife",
-        "segovia": "Segovia",
-        "sevilla": "Sevilla",
-        "soria": "Soria",
-        "tarragona": "Tarragona",
-        "teruel": "Teruel",
-        "toledo": "Toledo",
-        "valencia": "Valencia",
-        "valladolid": "Valladolid",
-        "zamora": "Zamora",
-        "zaragoza": "Zaragoza",
-    }
-}
-# Cutoff score for rapidfuzz in the name standardization function
-STANDARD_THRESHOLD = 40

datamarket-0.7.97/src/datamarket/utils/nominatim.py DELETED Viewed

@@ -1,38 +0,0 @@
-from typing import Optional, Literal
-from rapidfuzz import fuzz, process
-from ..params.nominatim import STATES, PROVINCES, STANDARD_THRESHOLD
-from .strings import normalize
-def standardize_admin_division(
-    name: str,
-    level: Literal["province", "state"] = "province",
-    country_code: str = "es"
-) -> Optional[str]:
-    """
-    Normalize and standardize administrative divisions of a given country using RapidFuzz.
-    Uses normalized dict keys for comparison and returns dict values with the official names.
-    """
-    if not name:
-        return None
-    country_code = country_code.lower()
-    mapping = STATES.get(country_code) if level == "state" else PROVINCES.get(country_code)
-    if not mapping: # If country is not standardized, return raw name
-        return name
-    normalized_name = normalize(name) # Essential for rapidfuzz to work well
-    result = process.extractOne(
-        normalized_name,
-        mapping.keys(), # Compare with the normalized names in the dict
-        scorer=fuzz.WRatio,
-        score_cutoff=STANDARD_THRESHOLD,
-    )
-    if not result:
-        return None
-    best_key, score, _ = result
-    # Return the standardized name corresponding to the normalized name
-    return mapping[best_key]