PyPI - datamarket - Versions diffs - 0.9.24__tar.gz → 0.9.26__tar.gz - Mend

datamarket 0.9.24.tar.gz → 0.9.26.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of datamarket might be problematic. Click here for more details.

Files changed (25)

{datamarket-0.9.24 → datamarket-0.9.26}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: datamarket
-Version: 0.9.24
+Version: 0.9.26
 Summary: Utilities that integrate advanced scraping knowledge into just one library.
 License: GPL-3.0-or-later
 Author: DataMarket
@@ -59,7 +59,6 @@ Provides-Extra: soda-core-postgres
 Provides-Extra: stem
 Provides-Extra: tqdm
 Provides-Extra: undetected-chromedriver
-Provides-Extra: unidecode
 Provides-Extra: xmltodict
 Requires-Dist: SQLAlchemy (>=2.0.0,<3.0.0) ; extra == "alchemy"
 Requires-Dist: azure-storage-blob (>=12.0.0,<13.0.0) ; extra == "azure-storage-blob"
@@ -84,10 +83,12 @@ Requires-Dist: google-auth-httplib2 (>=0.2.0,<0.3.0) ; extra == "google-auth-htt
 Requires-Dist: google-auth-oauthlib (>=1.0.0,<2.0.0) ; extra == "google-auth-oauthlib"
 Requires-Dist: html2text (>=2024.0.0,<2025.0.0) ; extra == "html2text"
 Requires-Dist: httpx[http2] (>=0.28.0,<0.29.0) ; extra == "httpx"
+Requires-Dist: inflection (>=0.5.0,<0.6.0)
 Requires-Dist: jinja2 (>=3.0.0,<4.0.0)
 Requires-Dist: json5 (>=0.10.0,<0.11.0) ; extra == "json5"
 Requires-Dist: lxml[html-clean] (>=5.0.0,<6.0.0) ; extra == "lxml"
 Requires-Dist: nodriver (>=0.44,<0.45) ; extra == "nodriver"
+Requires-Dist: numpy (>=2.0.0,<3.0.0)
 Requires-Dist: openpyxl (>=3.0.0,<4.0.0) ; extra == "openpyxl"
 Requires-Dist: pandas (>=2.0.0,<3.0.0) ; extra == "pandas"
 Requires-Dist: pandera (>=0.22.0,<0.23.0) ; extra == "pandera"
@@ -102,6 +103,7 @@ Requires-Dist: pymupdf (>=1.0.0,<2.0.0) ; extra == "pymupdf"
 Requires-Dist: pysocks (>=1.0.0,<2.0.0) ; extra == "pysocks"
 Requires-Dist: pyspark (>=3.0.0,<4.0.0) ; extra == "pyspark"
 Requires-Dist: pytest (>=8.0.0,<9.0.0) ; extra == "pytest"
+Requires-Dist: python-string-utils (>=1.0.0,<2.0.0)
 Requires-Dist: rapidfuzz (>=3.0.0,<4.0.0) ; extra == "rapidfuzz"
 Requires-Dist: requests (>=2.0.0,<3.0.0)
 Requires-Dist: retry (>=0.9.0,<0.10.0) ; extra == "retry"
@@ -112,7 +114,7 @@ Requires-Dist: stem (>=1.0.0,<2.0.0) ; extra == "stem" or extra == "proxy"
 Requires-Dist: tenacity (>=9.0.0,<10.0.0)
 Requires-Dist: tqdm (>=4.0.0,<5.0.0) ; extra == "tqdm"
 Requires-Dist: typer (>=0.15.0,<0.16.0)
-Requires-Dist: unidecode (>=1.0.0,<2.0.0) ; extra == "unidecode"
+Requires-Dist: unidecode (>=1.0.0,<2.0.0)
 Requires-Dist: xmltodict (>=0.14.0,<0.15.0) ; extra == "xmltodict"
 Project-URL: Documentation, https://github.com/Data-Market/datamarket
 Project-URL: Homepage, https://datamarket.es

{datamarket-0.9.24 → datamarket-0.9.26}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "datamarket"
-version = "0.9.24"
+version = "0.9.26"
 description = "Utilities that integrate advanced scraping knowledge into just one library."
 authors = ["DataMarket <techsupport@datamarket.es>"]
 license = "GPL-3.0-or-later"
@@ -26,9 +26,12 @@ pendulum = "^3.0.0"
 croniter = "^3.0.0"
 dynaconf = "^3.0.0"
 jinja2 = "^3.0.0"
+inflection = "~0.5.0"
+python-string-utils = "^1.0.0"
+unidecode = "^1.0.0"
+numpy = "^2.0.0"
 boto3 = { version = "~1.35.0", optional = true }
-unidecode = { version = "^1.0.0", optional = true }
 lxml = { extras = ["html-clean"], version = "^5.0.0", optional = true }
 tqdm = { version = "^4.0.0", optional = true }
 pandas = { version = "^2.0.0", optional = true }
@@ -74,7 +77,6 @@ camoufox = { extras = ["geoip"], version = "~0.4.11", optional = true }
 [tool.poetry.extras]
 boto3 = ["boto3"]
-unidecode = ["unidecode"]
 lxml = ["lxml"]
 tqdm = ["tqdm"]
 pandas = ["pandas"]

{datamarket-0.9.24 → datamarket-0.9.26}/src/datamarket/interfaces/alchemy.py RENAMED Viewed

@@ -11,6 +11,7 @@ from sqlalchemy.dialects.postgresql import insert
 from sqlalchemy.exc import IntegrityError
 from sqlalchemy.ext.declarative import DeclarativeMeta
 from sqlalchemy.orm import Session, sessionmaker
+from sqlalchemy.sql.expression import ClauseElement
 from enum import Enum
 ########################################################################################################################
@@ -220,17 +221,43 @@ class AlchemyInterface:
         if not silent:
             logger.info(f"upserting {alchemy_obj}")
-        primary_keys = list(col.name for col in alchemy_obj.__table__.primary_key.columns.values())
-        obj_dict = {
+        table = alchemy_obj.__table__
+        primary_keys = list(col.name for col in table.primary_key.columns.values())
+        # Build the dictionary for the INSERT values
+        insert_values = {
+            col.name: getattr(alchemy_obj, col.name)
+            for col in table.columns
+            if getattr(alchemy_obj, col.name) is not None  # Include all non-None values for insert
+        }
+        # Build the dictionary for the UPDATE set clause
+        # Start with values from the object, excluding primary keys
+        update_set_values = {
             col.name: val
-            for col in alchemy_obj.__table__.columns
+            for col in table.columns
             if col.name not in primary_keys and (val := getattr(alchemy_obj, col.name)) is not None
         }
+        # Add columns with SQL-based onupdate values explicitly to the set clause
+        for column in table.columns:
+            actual_sql_expression = None
+            if column.onupdate is not None:
+                if hasattr(column.onupdate, "arg") and isinstance(column.onupdate.arg, ClauseElement):
+                    # This handles wrappers like ColumnElementColumnDefault,
+                    # where the actual SQL expression is in the .arg attribute.
+                    actual_sql_expression = column.onupdate.arg
+                elif isinstance(column.onupdate, ClauseElement):
+                    # This handles cases where onupdate might be a direct SQL expression.
+                    actual_sql_expression = column.onupdate
+            if actual_sql_expression is not None:
+                update_set_values[column.name] = actual_sql_expression
         statement = (
-            insert(alchemy_obj.__table__)
-            .values(obj_dict)
-            .on_conflict_do_update(index_elements=index_elements, set_=obj_dict)
+            insert(table)
+            .values(insert_values)
+            .on_conflict_do_update(index_elements=index_elements, set_=update_set_values)
         )
         try:

datamarket-0.9.26/src/datamarket/utils/strings.py ADDED Viewed

@@ -0,0 +1,130 @@
+########################################################################################################################
+# IMPORTS
+from enum import Enum, auto
+from typing import Any
+import unicodedata
+import numpy as np
+from unidecode import unidecode
+from inflection import parameterize, underscore, titleize, camelize
+from string_utils import prettify, strip_html
+########################################################################################################################
+# CLASSES
+class NormalizationMode(Enum):
+    NONE = auto()
+    BASIC = auto()  # removes accents and converts punctuation to spaces
+    SYMBOLS = auto()  # translates only symbols to Unicode name
+    FULL = auto()  # BASIC + SYMBOLS
+class NamingConvention(Enum):
+    NONE = auto()  # no style change
+    CONSTANT = auto()  # CONSTANT_CASE (uppercase, underscores)
+    SNAKE = auto()  # snake_case (lowercase, underscores)
+    CAMEL = auto()  # camelCase (capitalize words except first one, no spaces)
+    PASCAL = auto()  # PascalCase (capitalize words including first one, no spaces)
+    PARAM = auto()  # parameterize (hyphens)
+    TITLE = auto()  # titleize (capitalize words)
+########################################################################################################################
+# FUNCTIONS
+def transliterate_symbols(s: str) -> str:
+    """
+    Translates symbols (category S*) to lowercase Unicode names,
+    with spaces→underscores. The rest of the text remains the same.
+    """
+    out: list[str] = []
+    for c in s:
+        if unicodedata.category(c).startswith("S"):
+            name = unicodedata.name(c, "")
+            if name:
+                out.append(name.lower().replace(" ", "_"))
+        else:
+            out.append(c)
+    return "".join(out)
+def normalize(
+    s: Any, mode: NormalizationMode = NormalizationMode.BASIC, naming: NamingConvention = NamingConvention.NONE
+) -> str:
+    """
+    1. Normalizes the string according to `mode`:
+       - NONE: returns the original input as an unprocessed string.
+       - BASIC: removes accents, converts punctuation to spaces, preserves alphanumeric characters.
+       - SYMBOLS: translates only symbols to Unicode name.
+       - FULL: combines BASIC + SYMBOLS.
+    2. Applies naming convention according to `naming`:
+       - NONE: returns the normalized text.
+       - PARAM: parameterize (hyphens).
+       - SNAKE: snake_case (underscore, lowercase).
+       - CONSTANT: CONSTANT_CASE (underscore, uppercase).
+    """
+    # Parameter mapping
+    if isinstance(mode, str):
+        mode = NormalizationMode[mode]
+    if isinstance(naming, str):
+        naming = NamingConvention[naming]
+    # Handling null values
+    if s is None or (isinstance(s, float) and np.isnan(s)):
+        normalized = ""
+    elif not isinstance(s, str):
+        return str(s)
+    else:
+        text = prettify(strip_html(str(s), True))
+        if mode is NormalizationMode.NONE:
+            normalized = text
+        elif mode is NormalizationMode.SYMBOLS:
+            normalized = transliterate_symbols(text)
+        else:
+            # BASIC and FULL: remove accents and lowercase
+            normalized = unidecode(text).lower()
+            tokens: list[str] = []
+            current: list[str] = []
+            def flush_current():
+                nonlocal current
+                if current:
+                    tokens.append("".join(current))
+                    current.clear()
+            for c in normalized:
+                cat = unicodedata.category(c)
+                if c.isalnum():
+                    current.append(c)
+                elif mode is NormalizationMode.FULL and cat.startswith("S"):
+                    flush_current()
+                    name = unicodedata.name(c, "")
+                    if name:
+                        tokens.append(name.lower().replace(" ", "_"))
+                elif cat.startswith("P") or c.isspace():
+                    flush_current()
+                # other characters ignored
+            flush_current()
+            normalized = " ".join(tokens)
+    # Apply naming convention
+    if naming is NamingConvention.NONE:
+        return normalized
+    if naming is NamingConvention.PARAM:
+        return parameterize(normalized)
+    if naming is NamingConvention.TITLE:
+        return titleize(normalized)
+    underscored = underscore(parameterize(normalized))
+    if naming is NamingConvention.CONSTANT:
+        return underscored.upper()
+    if naming is NamingConvention.CAMEL:
+        return camelize(underscored, False)
+    if naming is NamingConvention.PASCAL:
+        return camelize(underscored)
+    return underscored