PyPI - meerschaum - Versions diffs - 2.6.17__py3-none-any.whl → 2.7.0rc1__py3-none-any.whl - Mend

meerschaum 2.6.17__py3-none-any.whl → 2.7.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (31) hide show

meerschaum/actions/delete.py +65 -69
meerschaum/actions/install.py +1 -2
meerschaum/config/_default.py +1 -1
meerschaum/config/_paths.py +2 -1
meerschaum/config/_version.py +1 -1
meerschaum/connectors/api/_pipes.py +4 -3
meerschaum/connectors/sql/_pipes.py +63 -25
meerschaum/connectors/sql/_sql.py +6 -1
meerschaum/connectors/valkey/_pipes.py +12 -1
meerschaum/core/Pipe/__init__.py +23 -13
meerschaum/core/Pipe/_attributes.py +19 -0
meerschaum/core/Pipe/_dtypes.py +1 -1
meerschaum/core/Pipe/_sync.py +61 -21
meerschaum/core/Pipe/_verify.py +8 -7
meerschaum/plugins/_Plugin.py +11 -14
meerschaum/utils/daemon/Daemon.py +18 -11
meerschaum/utils/dataframe.py +175 -13
meerschaum/utils/dtypes/__init__.py +103 -14
meerschaum/utils/dtypes/sql.py +26 -0
meerschaum/utils/misc.py +8 -8
meerschaum/utils/sql.py +64 -11
meerschaum/utils/venv/_Venv.py +4 -4
meerschaum/utils/venv/__init__.py +33 -13
{meerschaum-2.6.17.dist-info → meerschaum-2.7.0rc1.dist-info}/METADATA +1 -1
{meerschaum-2.6.17.dist-info → meerschaum-2.7.0rc1.dist-info}/RECORD +31 -31
{meerschaum-2.6.17.dist-info → meerschaum-2.7.0rc1.dist-info}/LICENSE +0 -0
{meerschaum-2.6.17.dist-info → meerschaum-2.7.0rc1.dist-info}/NOTICE +0 -0
{meerschaum-2.6.17.dist-info → meerschaum-2.7.0rc1.dist-info}/WHEEL +0 -0
{meerschaum-2.6.17.dist-info → meerschaum-2.7.0rc1.dist-info}/entry_points.txt +0 -0
{meerschaum-2.6.17.dist-info → meerschaum-2.7.0rc1.dist-info}/top_level.txt +0 -0
{meerschaum-2.6.17.dist-info → meerschaum-2.7.0rc1.dist-info}/zip-safe +0 -0

meerschaum/utils/dataframe.py CHANGED Viewed

@@ -139,7 +139,6 @@ def filter_unseen_df(
     import functools
     import traceback
     from decimal import Decimal
-    from uuid import UUID
     from meerschaum.utils.warnings import warn
     from meerschaum.utils.packages import import_pandas, attempt_import
     from meerschaum.utils.dtypes import (
@@ -147,6 +146,7 @@ def filter_unseen_df(
         are_dtypes_equal,
         attempt_cast_to_numeric,
         attempt_cast_to_uuid,
+        attempt_cast_to_bytes,
         coerce_timezone,
     )
     pd = import_pandas(debug=debug)
@@ -333,6 +333,11 @@ def filter_unseen_df(
     old_uuid_cols = get_uuid_cols(old_df)
     new_uuid_cols = get_uuid_cols(new_df)
     uuid_cols = set(new_uuid_cols + old_uuid_cols)
+    old_bytes_cols = get_bytes_cols(old_df)
+    new_bytes_cols = get_bytes_cols(new_df)
+    bytes_cols = set(new_bytes_cols + old_bytes_cols)
     joined_df = merge(
         new_df.infer_objects(copy=False).fillna(NA),
         old_df.infer_objects(copy=False).fillna(NA),
@@ -368,6 +373,14 @@ def filter_unseen_df(
         except Exception:
             warn(f"Unable to parse numeric column '{uuid_col}':\n{traceback.format_exc()}")
+    for bytes_col in bytes_cols:
+        if bytes_col not in delta_df.columns:
+            continue
+        try:
+            delta_df[bytes_col] = delta_df[bytes_col].apply(attempt_cast_to_bytes)
+        except Exception:
+            warn(f"Unable to parse bytes column '{bytes_col}':\n{traceback.format_exc()}")
     return delta_df
@@ -429,6 +442,7 @@ def parse_df_datetimes(
     from meerschaum.utils.debug import dprint
     from meerschaum.utils.warnings import warn
     from meerschaum.utils.misc import items_str
+    from meerschaum.utils.dtypes import to_datetime
     import traceback
     pd = import_pandas()
     pandas = attempt_import('pandas')
@@ -494,7 +508,7 @@ def parse_df_datetimes(
     if len(cols_to_inspect) == 0:
         if debug:
-            dprint(f"All columns are ignored, skipping datetime detection...")
+            dprint("All columns are ignored, skipping datetime detection...")
         return df.fillna(pandas.NA)
     ### apply regex to columns to determine which are ISO datetimes
@@ -515,14 +529,10 @@ def parse_df_datetimes(
     try:
         if not using_dask:
-            df[datetime_cols] = df[datetime_cols].apply(
-                pd.to_datetime,
-                utc=True,
-                format='ISO8601',
-            )
+            df[datetime_cols] = df[datetime_cols].apply(to_datetime)
         else:
             df[datetime_cols] = df[datetime_cols].apply(
-                pd.to_datetime,
+                to_datetime,
                 utc=True,
                 axis=1,
                 meta={
@@ -665,7 +675,7 @@ def get_uuid_cols(df: 'pd.DataFrame') -> List[str]:
     Returns
     -------
-    A list of columns to treat as numerics.
+    A list of columns to treat as UUIDs.
     """
     if df is None:
         return []
@@ -692,6 +702,135 @@ def get_uuid_cols(df: 'pd.DataFrame') -> List[str]:
     ]
+def get_datetime_cols(
+    df: 'pd.DataFrame',
+    timezone_aware: bool = True,
+    timezone_naive: bool = True,
+) -> List[str]:
+    """
+    Get the columns which contain `datetime` or `Timestamp` objects from a Pandas DataFrame.
+    Columns are detected by their dtype or, for the remaining columns,
+    by inspecting the value at the first valid index.
+    Parameters
+    ----------
+    df: pd.DataFrame
+        The DataFrame which may contain `datetime` or `Timestamp` objects.
+        For Dask DataFrames, the first valid partition is inspected.
+    timezone_aware: bool, default True
+        If `True`, include timezone-aware datetime columns.
+    timezone_naive: bool, default True
+        If `True`, include timezone-naive datetime columns.
+    Returns
+    -------
+    A list of columns to treat as datetimes, in the order they appear in `df`.
+    Raises
+    ------
+    ValueError
+        If both `timezone_aware` and `timezone_naive` are `False`.
+    """
+    if not timezone_aware and not timezone_naive:
+        raise ValueError("`timezone_aware` and `timezone_naive` cannot both be `False`.")
+    if df is None:
+        return []
+    from datetime import datetime
+    from meerschaum.utils.dtypes import are_dtypes_equal
+    is_dask = 'dask' in df.__module__
+    if is_dask:
+        df = get_first_valid_dask_partition(df)
+    ### Keep the dtype objects so timezone-awareness may be read from them below.
+    known_dt_dtypes = {
+        col: typ
+        for col, typ in df.dtypes.items()
+        if are_dtypes_equal('datetime', str(typ))
+    }
+    known_dt_cols = list(known_dt_dtypes)
+    ### NOTE: An empty DataFrame has no values to inspect,
+    ### so only dtype-detected columns may be found (the timezone flags still apply).
+    cols_indices = (
+        {
+            col: df[col].first_valid_index()
+            for col in df.columns
+            if col not in known_dt_dtypes
+        }
+        if len(df) > 0
+        else {}
+    )
+    pydt_cols = [
+        col
+        for col, ix in cols_indices.items()
+        if (
+            ix is not None
+            and
+            isinstance(df.loc[ix][col], datetime)
+        )
+    ]
+    dt_cols_set = set(known_dt_cols + pydt_cols)
+    all_dt_cols = [
+        col
+        for col in df.columns
+        if col in dt_cols_set
+    ]
+    if timezone_aware and timezone_naive:
+        return all_dt_cols
+    ### NOTE: The timezone lives on the dtype (e.g. `datetime64[ns, UTC]`), not on the Series.
+    known_timezone_aware_dt_cols = [
+        col
+        for col, typ in known_dt_dtypes.items()
+        if getattr(typ, 'tz', None) is not None
+    ]
+    timezone_aware_pydt_cols = [
+        col
+        for col in pydt_cols
+        if df.loc[cols_indices[col]][col].tzinfo is not None
+    ]
+    timezone_aware_dt_cols_set = set(known_timezone_aware_dt_cols + timezone_aware_pydt_cols)
+    if timezone_aware:
+        ### Include both dtype-detected and value-detected timezone-aware columns.
+        return [
+            col
+            for col in all_dt_cols
+            if col in timezone_aware_dt_cols_set
+        ]
+    return [
+        col
+        for col in all_dt_cols
+        if col not in timezone_aware_dt_cols_set
+    ]
+def get_bytes_cols(df: 'pd.DataFrame') -> List[str]:
+    """
+    Find the columns of a Pandas DataFrame whose first valid value is a `bytes` object.
+    Parameters
+    ----------
+    df: pd.DataFrame
+        The DataFrame to inspect (for Dask DataFrames, the first valid partition is used).
+    Returns
+    -------
+    A list of columns to treat as bytes (empty if `df` is `None` or has no rows).
+    """
+    if df is None:
+        return []
+    if 'dask' in df.__module__:
+        df = get_first_valid_dask_partition(df)
+    if not len(df):
+        return []
+    ### Resolve the first non-null index of every column before inspecting any values.
+    first_indices = {}
+    for col in df.columns:
+        first_indices[col] = df[col].first_valid_index()
+    bytes_cols = []
+    for col, first_ix in first_indices.items():
+        ### Columns which are entirely null have no value to inspect.
+        if first_ix is None:
+            continue
+        if isinstance(df.loc[first_ix][col], bytes):
+            bytes_cols.append(col)
+    return bytes_cols
 def enforce_dtypes(
     df: 'pd.DataFrame',
     dtypes: Dict[str, str],
@@ -743,6 +882,7 @@ def enforce_dtypes(
         is_dtype_numeric,
         attempt_cast_to_numeric,
         attempt_cast_to_uuid,
+        attempt_cast_to_bytes,
         coerce_timezone as _coerce_timezone,
     )
     pandas = mrsm.attempt_import('pandas')
@@ -773,6 +913,11 @@ def enforce_dtypes(
         for col, typ in dtypes.items()
         if typ == 'uuid'
     ]
+    bytes_cols = [
+        col
+        for col, typ in dtypes.items()
+        if typ == 'bytes'
+    ]
     datetime_cols = [
         col
         for col, typ in dtypes.items()
@@ -826,6 +971,17 @@ def enforce_dtypes(
                     if debug:
                         dprint(f"Unable to parse column '{col}' as UUID:\n{e}")
+    if bytes_cols:
+        if debug:
+            dprint(f"Checking for bytes: {bytes_cols}")
+        for col in bytes_cols:
+            if col in df.columns:
+                try:
+                    df[col] = df[col].apply(attempt_cast_to_bytes)
+                except Exception as e:
+                    if debug:
+                        dprint(f"Unable to parse column '{col}' as bytes:\n{e}")
     if datetime_cols and coerce_timezone:
         if debug:
             dprint(f"Checking for datetime conversion: {datetime_cols}")
@@ -931,6 +1087,8 @@ def get_datetime_bound_from_df(
     -------
     The minimum or maximum datetime value in the dataframe, or `None`.
     """
+    from meerschaum.utils.dtypes import to_datetime, value_is_null
     if df is None:
         return None
     if not datetime_column:
@@ -982,9 +1140,9 @@ def get_datetime_bound_from_df(
             dt_val = dt_val.compute()
         return (
-            pandas.to_datetime(dt_val).to_pydatetime()
+            to_datetime(dt_val, as_pydatetime=True)
             if are_dtypes_equal(str(type(dt_val)), 'datetime')
-            else (dt_val if dt_val is not pandas.NA else None)
+            else (dt_val if not value_is_null(dt_val) else None)
         )
     return None
@@ -1127,7 +1285,7 @@ def get_first_valid_dask_partition(ddf: 'dask.dataframe.DataFrame') -> Union['pd
     for partition in ddf.partitions:
         try:
             pdf = partition.compute()
-        except Exception as e:
+        except Exception:
             continue
         if len(pdf) > 0:
             return pdf
@@ -1408,12 +1566,16 @@ def to_json(
     A JSON string.
     """
     from meerschaum.utils.packages import import_pandas
+    from meerschaum.utils.dtypes import serialize_bytes
     pd = import_pandas()
     uuid_cols = get_uuid_cols(df)
-    if uuid_cols and safe_copy:
+    bytes_cols = get_bytes_cols(df)
+    if safe_copy and bool(uuid_cols or bytes_cols):
         df = df.copy()
     for col in uuid_cols:
         df[col] = df[col].astype(str)
+    for col in bytes_cols:
+        df[col] = df[col].apply(serialize_bytes)
     return df.infer_objects(copy=False).fillna(pd.NA).to_json(
         date_format=date_format,
         date_unit=date_unit,

meerschaum/utils/dtypes/__init__.py CHANGED Viewed

@@ -15,7 +15,19 @@ import meerschaum as mrsm
 from meerschaum.utils.typing import Dict, Union, Any
 from meerschaum.utils.warnings import warn
-MRSM_PD_DTYPES: Dict[str, str] = {
+### Alternate spellings of dtypes, mapped to the dtype names used as keys in `MRSM_PD_DTYPES`.
+### NOTE: Keys are matched exactly as written (e.g. both 'JSON' and 'jsonl' map to 'json').
+MRSM_ALIAS_DTYPES: Dict[str, str] = {
+    'decimal': 'numeric',
+    'number': 'numeric',
+    'jsonl': 'json',
+    'JSON': 'json',
+    'binary': 'bytes',
+    'blob': 'bytes',
+    'varbinary': 'bytes',
+    'bytea': 'bytes',
+    'guid': 'uuid',
+    'UUID': 'uuid',
+}
+MRSM_PD_DTYPES: Dict[Union[str, None], str] = {
     'json': 'object',
     'numeric': 'object',
     'uuid': 'object',
@@ -27,6 +39,8 @@ MRSM_PD_DTYPES: Dict[str, str] = {
     'int32': 'Int32',
     'int64': 'Int64',
     'str': 'string[python]',
+    'bytes': 'object',
+    None: 'object',
 }
@@ -38,6 +52,10 @@ def to_pandas_dtype(dtype: str) -> str:
     if known_dtype is not None:
         return known_dtype
+    alias_dtype = MRSM_ALIAS_DTYPES.get(dtype, None)
+    if alias_dtype is not None:
+        return MRSM_PD_DTYPES[alias_dtype]
     ### NOTE: Kind of a hack, but if the first word of the given dtype is in all caps,
     ### treat it as a SQL db type.
     if dtype.split(' ')[0].isupper():
@@ -95,7 +113,7 @@ def are_dtypes_equal(
     try:
         if ldtype == rdtype:
             return True
-    except Exception as e:
+    except Exception:
         warn(f"Exception when comparing dtypes, returning False:\n{traceback.format_exc()}")
         return False
@@ -115,6 +133,10 @@ def are_dtypes_equal(
     if ldtype in uuid_dtypes and rdtype in uuid_dtypes:
         return True
+    bytes_dtypes = ('bytes', 'object')
+    if ldtype in bytes_dtypes and rdtype in bytes_dtypes:
+        return True
     ldtype_clean = ldtype.split('[', maxsplit=1)[0]
     rdtype_clean = rdtype.split('[', maxsplit=1)[0]
@@ -185,7 +207,7 @@ def attempt_cast_to_numeric(value: Any) -> Any:
             if not value_is_null(value)
             else Decimal('NaN')
         )
-    except Exception as e:
+    except Exception:
         return value
@@ -201,7 +223,23 @@ def attempt_cast_to_uuid(value: Any) -> Any:
             if not value_is_null(value)
             else None
         )
-    except Exception as e:
+    except Exception:
+        return value
+def attempt_cast_to_bytes(value: Any) -> Any:
+    """
+    Given a value, attempt to coerce it into a bytestring.
+    Values which are already `bytes` (or `UUID` objects) are returned unchanged.
+    Null values become `None`, and any other value is stringified and decoded as base64.
+    If decoding fails, the original value is returned.
+    """
+    ### NOTE: Never re-decode existing bytes: `str(b'...')` yields "b'...'",
+    ### and non-validating base64 decoding discards the stray characters,
+    ### which may silently produce corrupted data instead of raising.
+    if isinstance(value, (bytes, uuid.UUID)):
+        return value
+    try:
+        return (
+            deserialize_base64(str(value))
+            if not value_is_null(value)
+            else None
+        )
+    except Exception:
         return value
@@ -251,7 +289,7 @@ def coerce_timezone(
 ) -> Any:
     """
     Given a `datetime`, pandas `Timestamp` or `Series` of `Timestamp`,
-    return a naive datetime in terms of UTC.
+    return a UTC timestamp (strip the timezone if `strip_utc` is `True`).
     """
     if dt is None:
         return None
@@ -266,9 +304,7 @@ def coerce_timezone(
     dt_is_series = hasattr(dt, 'dtype') and hasattr(dt, '__module__')
     if dt_is_series:
-        is_dask = 'dask' in dt.__module__
         pandas = mrsm.attempt_import('pandas', lazy=False)
-        dd = mrsm.attempt_import('dask.dataframe') if is_dask else None
         if (
             pandas.api.types.is_datetime64_any_dtype(dt) and (
@@ -279,14 +315,13 @@ def coerce_timezone(
         ):
             return dt
-        dt_series = (
-            pandas.to_datetime(dt, utc=True, format='ISO8601')
-            if dd is None
-            else dd.to_datetime(dt, utc=True, format='ISO8601')
-        )
+        dt_series = to_datetime(dt, coerce_utc=False)
         if strip_utc:
-            if dt_series.dt.tz is not None:
-                dt_series = dt_series.dt.tz_localize(None)
+            try:
+                if dt_series.dt.tz is not None:
+                    dt_series = dt_series.dt.tz_localize(None)
+            except Exception:
+                pass
         return dt_series
@@ -299,3 +334,57 @@ def coerce_timezone(
     if strip_utc:
         return utc_dt.replace(tzinfo=None)
     return utc_dt
+def to_datetime(dt_val: Any, as_pydatetime: bool = False, coerce_utc: bool = True) -> Any:
+    """
+    Wrap `pd.to_datetime()` and add support for out-of-bounds values.
+    Parameters
+    ----------
+    dt_val: Any
+        The value (or Pandas / Dask series) to parse.
+    as_pydatetime: bool, default False
+        If `True`, call `.to_pydatetime()` on the result of `pd.to_datetime()`.
+        This flag is not applied to values parsed by the `dateutil` fallback.
+    coerce_utc: bool, default True
+        If `True`, pass values parsed by the `dateutil` fallback through `coerce_timezone()`.
+        The `pd.to_datetime()` path always parses with `utc=True`.
+    Returns
+    -------
+    The parsed value. Values which cannot be parsed by the fallback are returned unchanged.
+    """
+    pandas, dateutil_parser = mrsm.attempt_import('pandas', 'dateutil.parser', lazy=False)
+    is_dask = 'dask' in getattr(dt_val, '__module__', '')
+    dd = mrsm.attempt_import('dask.dataframe') if is_dask else None
+    dt_is_series = hasattr(dt_val, 'dtype') and hasattr(dt_val, '__module__')
+    pd = pandas if dd is None else dd
+    try:
+        new_dt_val = pd.to_datetime(dt_val, utc=True, format='ISO8601')
+        if as_pydatetime:
+            return new_dt_val.to_pydatetime()
+        return new_dt_val
+    ### NOTE: Reference `pandas.errors` directly: `pd` may be `dask.dataframe` here,
+    ### and a failed attribute lookup in the `except` clause would mask the original error.
+    except (pandas.errors.OutOfBoundsDatetime, ValueError):
+        pass
+    def parse(x: Any) -> Any:
+        ### Fall back to `dateutil`, leaving unparsable values untouched.
+        try:
+            return dateutil_parser.parse(x)
+        except Exception:
+            return x
+    if dt_is_series:
+        new_series = dt_val.apply(parse)
+        if coerce_utc:
+            return coerce_timezone(new_series)
+        return new_series
+    new_dt_val = parse(dt_val)
+    if not coerce_utc:
+        return new_dt_val
+    return coerce_timezone(new_dt_val)
+def serialize_bytes(data: bytes) -> str:
+    """
+    Encode a bytestring as base64 text.
+    Null values which are not `bytes` are passed through unchanged.
+    """
+    import base64
+    is_bytes = isinstance(data, bytes)
+    if not is_bytes and value_is_null(data):
+        return data
+    encoded = base64.b64encode(data)
+    return encoded.decode('utf-8')
+def deserialize_base64(data: str) -> bytes:
+    """
+    Decode a base64-encoded string back into its original bytestring.
+    """
+    from base64 import b64decode
+    return b64decode(data)

meerschaum/utils/dtypes/sql.py CHANGED Viewed

@@ -276,6 +276,19 @@ PD_TO_DB_DTYPES_FLAVORS: Dict[str, Dict[str, str]] = {
         'cockroachdb': 'UUID',
         'default': 'TEXT',
     },
+    'bytes': {
+        'timescaledb': 'BYTEA',
+        'postgresql': 'BYTEA',
+        'mariadb': 'BLOB',
+        'mysql': 'BLOB',
+        'mssql': 'VARBINARY(MAX)',
+        'oracle': 'BLOB',
+        'sqlite': 'BLOB',
+        'duckdb': 'BLOB',
+        'citus': 'BYTEA',
+        'cockroachdb': 'BYTEA',
+        'default': 'BLOB',
+    },
 }
 PD_TO_SQLALCHEMY_DTYPES_FLAVORS: Dict[str, Dict[str, str]] = {
     'int': {
@@ -421,6 +434,19 @@ PD_TO_SQLALCHEMY_DTYPES_FLAVORS: Dict[str, Dict[str, str]] = {
         'cockroachdb': 'Uuid',
         'default': 'Uuid',
     },
+    'bytes': {
+        'timescaledb': 'LargeBinary',
+        'postgresql': 'LargeBinary',
+        'mariadb': 'LargeBinary',
+        'mysql': 'LargeBinary',
+        'mssql': 'LargeBinary',
+        'oracle': 'LargeBinary',
+        'sqlite': 'LargeBinary',
+        'duckdb': 'LargeBinary',
+        'citus': 'LargeBinary',
+        'cockroachdb': 'LargeBinary',
+        'default': 'LargeBinary',
+    },
 }
 AUTO_INCREMENT_COLUMN_FLAVORS: Dict[str, str] = {

meerschaum/utils/misc.py CHANGED Viewed

@@ -177,14 +177,14 @@ def string_to_dict(
         keys = _keys[:-1]
         try:
             val = ast.literal_eval(_keys[-1])
-        except Exception as e:
+        except Exception:
             val = str(_keys[-1])
         c = params_dict
         for _k in keys[:-1]:
             try:
                 k = ast.literal_eval(_k)
-            except Exception as e:
+            except Exception:
                 k = str(_k)
             if k not in c:
                 c[k] = {}
@@ -196,12 +196,12 @@ def string_to_dict(
 def parse_config_substitution(
-        value: str,
-        leading_key: str = 'MRSM',
-        begin_key: str = '{',
-        end_key: str = '}',
-        delimeter: str = ':'
-    ) -> List[Any]:
+    value: str,
+    leading_key: str = 'MRSM',
+    begin_key: str = '{',
+    end_key: str = '}',
+    delimeter: str = ':'
+) -> List[Any]:
     """
     Parse Meerschaum substitution syntax
     E.g. MRSM{value1:value2} => ['value1', 'value2']

meerschaum 2.6.17__py3-none-any.whl → 2.7.0rc1__py3-none-any.whl

meerschaum 2.6.17__py3-none-any.whl → 2.7.0rc1__py3-none-any.whl