meerschaum 2.5.1__py3-none-any.whl → 2.6.0.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- meerschaum/_internal/arguments/_parser.py +6 -1
- meerschaum/actions/edit.py +6 -6
- meerschaum/actions/sql.py +12 -11
- meerschaum/config/_edit.py +46 -19
- meerschaum/config/_read_config.py +20 -9
- meerschaum/config/_version.py +1 -1
- meerschaum/config/stack/__init__.py +1 -1
- meerschaum/connectors/sql/_pipes.py +80 -24
- meerschaum/connectors/sql/_sql.py +29 -10
- meerschaum/connectors/valkey/_pipes.py +1 -1
- meerschaum/core/Pipe/__init__.py +8 -9
- meerschaum/core/Pipe/_attributes.py +33 -11
- meerschaum/core/Pipe/_data.py +26 -7
- meerschaum/core/Pipe/_dtypes.py +4 -4
- meerschaum/core/Pipe/_fetch.py +1 -1
- meerschaum/core/Pipe/_sync.py +16 -4
- meerschaum/core/Pipe/_verify.py +1 -1
- meerschaum/utils/dataframe.py +56 -29
- meerschaum/utils/dtypes/__init__.py +16 -5
- meerschaum/utils/dtypes/sql.py +58 -28
- meerschaum/utils/misc.py +49 -16
- meerschaum/utils/sql.py +224 -40
- {meerschaum-2.5.1.dist-info → meerschaum-2.6.0.dev1.dist-info}/METADATA +1 -1
- {meerschaum-2.5.1.dist-info → meerschaum-2.6.0.dev1.dist-info}/RECORD +30 -30
- {meerschaum-2.5.1.dist-info → meerschaum-2.6.0.dev1.dist-info}/WHEEL +1 -1
- {meerschaum-2.5.1.dist-info → meerschaum-2.6.0.dev1.dist-info}/LICENSE +0 -0
- {meerschaum-2.5.1.dist-info → meerschaum-2.6.0.dev1.dist-info}/NOTICE +0 -0
- {meerschaum-2.5.1.dist-info → meerschaum-2.6.0.dev1.dist-info}/entry_points.txt +0 -0
- {meerschaum-2.5.1.dist-info → meerschaum-2.6.0.dev1.dist-info}/top_level.txt +0 -0
- {meerschaum-2.5.1.dist-info → meerschaum-2.6.0.dev1.dist-info}/zip-safe +0 -0
meerschaum/core/Pipe/_attributes.py
CHANGED
@@ -103,10 +103,25 @@ def indices(self) -> Union[Dict[str, Union[str, List[str]]], None]:
         if indices_key not in self.parameters:
             self.parameters[indices_key] = {}
         _indices = self.parameters[indices_key]
+        _columns = self.columns
+        dt_col = _columns.get('datetime', None)
         if not isinstance(_indices, dict):
             _indices = {}
             self.parameters[indices_key] = _indices
-
+        unique_cols = (
+            [dt_col]
+            if dt_col
+            else []
+        ) + [
+            col
+            for col_ix, col in _columns.items()
+            if col_ix != 'datetime'
+        ]
+        return {
+            **({'unique': unique_cols} if len(unique_cols) > 1 else {}),
+            **_columns,
+            **_indices
+        }


     @property
@@ -196,7 +211,7 @@ def get_columns(self, *args: str, error: bool = False) -> Union[str, Tuple[str]]
         ----------
         *args: str
             The column names to be retrieved.
-
+
         error: bool, default False
             If `True`, raise an `Exception` if the specified column is not defined.

@@ -509,15 +524,22 @@ def get_indices(self) -> Dict[str, str]:
             if cols
         }
         _index_names = {
-            ix: (
-
-
-
-
-
-                location_key=self.location_key,
-            )
+            ix: _index_template.format(
+                target=_target,
+                column_names=column_names,
+                connector_keys=self.connector_keys,
+                metric_key=self.metric_key,
+                location_key=self.location_key,
             )
             for ix, column_names in _column_names.items()
         }
-
+        ### NOTE: Skip any duplicate indices.
+        seen_index_names = {}
+        for ix, index_name in _index_names.items():
+            if index_name in seen_index_names:
+                continue
+            seen_index_names[index_name] = ix
+        return {
+            ix: index_name
+            for index_name, ix in seen_index_names.items()
+        }
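A minimal standalone sketch of the merge logic the `indices` property now applies (plain Python; the column names are invented for illustration). The composite 'unique' entry only appears when more than one column participates:

    # Sketch of the new `Pipe.indices` merge logic (not the actual method).
    columns = {'datetime': 'dt', 'id': 'station_id'}
    explicit_indices = {}

    dt_col = columns.get('datetime', None)
    unique_cols = ([dt_col] if dt_col else []) + [
        col for col_ix, col in columns.items()
        if col_ix != 'datetime'
    ]
    print({
        **({'unique': unique_cols} if len(unique_cols) > 1 else {}),
        **columns,
        **explicit_indices,
    })
    # {'unique': ['dt', 'station_id'], 'datetime': 'dt', 'id': 'station_id'}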
meerschaum/core/Pipe/_data.py
CHANGED
@@ -23,8 +23,8 @@ def get_data(
     self,
     select_columns: Optional[List[str]] = None,
     omit_columns: Optional[List[str]] = None,
-    begin: Union[datetime, int, None] = None,
-    end: Union[datetime, int, None] = None,
+    begin: Union[datetime, int, str, None] = None,
+    end: Union[datetime, int, str, None] = None,
     params: Optional[Dict[str, Any]] = None,
     as_iterator: bool = False,
     as_chunks: bool = False,
@@ -48,12 +48,12 @@ def get_data(
     omit_columns: Optional[List[str]], default None
         If provided, remove these columns from the selection.

-    begin: Union[datetime, int, None], default None
+    begin: Union[datetime, int, str, None], default None
         Lower bound datetime to begin searching for data (inclusive).
         Translates to a `WHERE` clause like `WHERE datetime >= begin`.
         Defaults to `None`.

-    end: Union[datetime, int, None], default None
+    end: Union[datetime, int, str, None], default None
         Upper bound datetime to stop searching for data (inclusive).
         Translates to a `WHERE` clause like `WHERE datetime < end`.
         Defaults to `None`.
@@ -105,11 +105,12 @@ def get_data(
     from meerschaum.utils.venv import Venv
     from meerschaum.connectors import get_connector_plugin
     from meerschaum.utils.misc import iterate_chunks, items_str
-    from meerschaum.utils.dtypes import to_pandas_dtype
+    from meerschaum.utils.dtypes import to_pandas_dtype, coerce_timezone
     from meerschaum.utils.dataframe import add_missing_cols_to_df, df_is_chunk_generator
     from meerschaum.utils.packages import attempt_import
     dd = attempt_import('dask.dataframe') if as_dask else None
     dask = attempt_import('dask') if as_dask else None
+    dateutil_parser = attempt_import('dateutil.parser')

     if select_columns == '*':
         select_columns = None
@@ -120,11 +121,29 @@ def get_data(
         omit_columns = [omit_columns]

     as_iterator = as_iterator or as_chunks
+    dt_col = self.columns.get('datetime', None)
+    dt_typ = self.dtypes.get(dt_col, 'datetime64[ns, UTC]')
+    dt_is_utc = 'utc' in dt_typ.lower()
+    if isinstance(begin, str):
+        try:
+            begin = dateutil_parser.parse(begin)
+        except Exception as e:
+            warn(f"Failed to parse '{begin}' as datetime:\n{e}")
+            begin = None
+    if isinstance(end, str):
+        try:
+            end = dateutil_parser.parse(end)
+        except Exception as e:
+            warn(f"Failed to parse '{end}' as datetime:\n{e}")
+            end = None
+    if isinstance(begin, datetime):
+        begin = coerce_timezone(begin, strip_utc=(not dt_is_utc))
+    if isinstance(end, datetime):
+        end = coerce_timezone(end, strip_utc=(not dt_is_utc))

     def _sort_df(_df):
         if df_is_chunk_generator(_df):
             return _df
-        dt_col = self.columns.get('datetime', None)
         indices = [] if dt_col not in _df.columns else [dt_col]
         non_dt_cols = [
             col
@@ -607,7 +626,7 @@ def get_chunk_interval(
     if dt_col is None:
         return timedelta(minutes=chunk_minutes)

-    dt_dtype = self.dtypes.get(dt_col, 'datetime64[ns]')
+    dt_dtype = self.dtypes.get(dt_col, 'datetime64[ns, UTC]')
     if 'int' in dt_dtype.lower():
         return chunk_minutes
     return timedelta(minutes=chunk_minutes)
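With these changes, `begin` and `end` may be passed to `get_data()` as strings; they are parsed with `dateutil` and then coerced to match the timezone-awareness of the pipe's datetime axis. A hedged usage sketch (the pipe keys here are hypothetical):

    import meerschaum as mrsm

    pipe = mrsm.Pipe('sql:main', 'weather')  # hypothetical keys

    # Strings are parsed via dateutil.parser, then coerced with
    # coerce_timezone() to match the pipe's datetime dtype.
    df = pipe.get_data(begin='2024-01-01', end='2024-02-01 12:00')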
meerschaum/core/Pipe/_dtypes.py
CHANGED
@@ -101,18 +101,18 @@ def infer_dtypes(self, persist: bool = False, debug: bool = False) -> Dict[str,
         dt_col = self.columns.get('datetime', None)
         if dt_col:
             if not self.parameters.get('dtypes', {}).get(dt_col, None):
-                dtypes[dt_col] = 'datetime64[ns]'
+                dtypes[dt_col] = 'datetime64[ns, UTC]'
         return dtypes

-    from meerschaum.utils.sql import
-    from meerschaum.utils.
+    from meerschaum.utils.dtypes.sql import get_pd_type_from_db_type
+    from meerschaum.utils.dtypes import to_pandas_dtype
     columns_types = self.get_columns_types(debug=debug)

     ### NOTE: get_columns_types() may return either the types as
     ### PostgreSQL- or Pandas-style.
     dtypes = {
         c: (
-
+            get_pd_type_from_db_type(t, allow_custom_dtypes=True)
             if str(t).isupper()
             else to_pandas_dtype(t)
         )
meerschaum/core/Pipe/_fetch.py
CHANGED
@@ -125,7 +125,7 @@ def get_backtrack_interval(
     if dt_col is None:
         return backtrack_interval

-    dt_dtype = self.dtypes.get(dt_col, 'datetime64[ns]')
+    dt_dtype = self.dtypes.get(dt_col, 'datetime64[ns, UTC]')
     if 'int' in dt_dtype.lower():
         return backtrack_minutes

meerschaum/core/Pipe/_sync.py
CHANGED
@@ -624,6 +624,18 @@ def filter_existing(
     merge = pd.merge
     NA = pd.NA

+    primary_key = self.columns.get('primary', None)
+    autoincrement = self.parameters.get('autoincrement', False)
+    pipe_columns = self.columns.copy()
+
+    if primary_key and autoincrement and df is not None and primary_key in df.columns:
+        if safe_copy:
+            df = df.copy()
+            safe_copy = False
+        if df[primary_key].isnull().all():
+            del df[primary_key]
+            _ = self.columns.pop(primary_key, None)
+
     def get_empty_df():
         empty_df = pd.DataFrame([])
         dtypes = dict(df.dtypes) if df is not None else {}
@@ -643,8 +655,8 @@ def filter_existing(

     ### begin is the oldest data in the new dataframe
     begin, end = None, None
-    dt_col =
-    dt_type = self.dtypes.get(dt_col, 'datetime64[ns]') if dt_col else None
+    dt_col = pipe_columns.get('datetime', None)
+    dt_type = self.dtypes.get(dt_col, 'datetime64[ns, UTC]') if dt_col else None
     try:
         min_dt_val = df[dt_col].min(skipna=True) if dt_col else None
         if is_dask and min_dt_val is not None:
@@ -713,7 +725,7 @@ def filter_existing(

     unique_index_vals = {
         col: df[col].unique()
-        for col in
+        for col in pipe_columns
         if col in df.columns and col != dt_col
     } if not date_bound_only else {}
     filter_params_index_limit = get_config('pipes', 'sync', 'filter_params_index_limit')
@@ -749,7 +761,7 @@ def filter_existing(

     ### Separate new rows from changed ones.
     on_cols = [
-        col for col_key, col in
+        col for col_key, col in pipe_columns.items()
         if (
             col
             and
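The new guard at the top of `filter_existing()` drops an entirely-null autoincrementing primary key column before comparing against existing data, letting the database generate the values on insert. A standalone sketch of that check (pandas only; column names invented):

    import pandas as pd

    primary_key = 'id'
    autoincrement = True
    df = pd.DataFrame({
        'id': [None, None],
        'dt': pd.to_datetime(['2024-01-01', '2024-01-02'], utc=True),
    })

    # Drop the primary key column only when every value is null.
    if primary_key and autoincrement and primary_key in df.columns:
        if df[primary_key].isnull().all():
            del df[primary_key]

    print(df.columns.tolist())  # ['dt']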
meerschaum/core/Pipe/_verify.py
CHANGED
@@ -394,7 +394,7 @@ def get_bound_interval(self, debug: bool = False) -> Union[timedelta, int, None]
     if not dt_col:
         return bound_time_value

-    dt_typ = self.dtypes.get(dt_col, 'datetime64[ns]')
+    dt_typ = self.dtypes.get(dt_col, 'datetime64[ns, UTC]')
     if 'int' in dt_typ.lower():
         return int(bound_time_value)

meerschaum/utils/dataframe.py
CHANGED
@@ -94,14 +94,14 @@ def filter_unseen_df(
     ----------
     old_df: 'pd.DataFrame'
         The original (target) dataframe. Acts as a filter on the `new_df`.
-
+
     new_df: 'pd.DataFrame'
         The fetched (source) dataframe. Rows that are contained in `old_df` are removed.

     safe_copy: bool, default True
         If `True`, create a copy before comparing and modifying the dataframes.
         Setting to `False` may mutate the DataFrames.
-
+
     dtypes: Optional[Dict[str, Any]], default None
         Optionally specify the datatypes of the dataframe.

@@ -234,8 +234,10 @@ def filter_unseen_df(
     cast_dt_cols = True
     try:
         for col, typ in dt_dtypes.items():
-
-
+            if col in old_df.columns:
+                old_df[col] = coerce_timezone(pd.to_datetime(old_df[col], utc=True))
+            if col in new_df.columns:
+                new_df[col] = coerce_timezone(pd.to_datetime(new_df[col], utc=True))
         cast_dt_cols = False
     except Exception as e:
         warn(f"Could not cast datetime columns:\n{e}")
@@ -363,6 +365,7 @@ def filter_unseen_df(
 def parse_df_datetimes(
     df: 'pd.DataFrame',
     ignore_cols: Optional[Iterable[str]] = None,
+    strip_timezone: bool = True,
     chunksize: Optional[int] = None,
     dtype_backend: str = 'numpy_nullable',
     debug: bool = False,
@@ -378,6 +381,9 @@ def parse_df_datetimes(
     ignore_cols: Optional[Iterable[str]], default None
         If provided, do not attempt to coerce these columns as datetimes.

+    strip_timezone: bool, default True
+        If `True`, remove the UTC `tzinfo` property.
+
     chunksize: Optional[int], default None
         If the pandas implementation is `'dask'`, use this chunksize for the distributed dataframe.

@@ -385,7 +391,7 @@ def parse_df_datetimes(
         If `df` is not a DataFrame and new one needs to be constructed,
         use this as the datatypes backend.
         Accepted values are 'numpy_nullable' and 'pyarrow'.
-
+
     debug: bool, default False
         Verbosity toggle.

@@ -447,7 +453,7 @@ def parse_df_datetimes(
                     for doc in df
                 ] for k in keys
             },
-            npartitions
+            npartitions=npartitions,
         )
     elif isinstance(df, dict):
         df = pd.DataFrame.from_dict(df, npartitions=npartitions)
@@ -500,14 +506,18 @@ def parse_df_datetimes(

     try:
         if not using_dask:
-            df[datetime_cols] = df[datetime_cols].apply(
+            df[datetime_cols] = df[datetime_cols].apply(
+                pd.to_datetime,
+                utc=True,
+                format='ISO8601',
+            )
         else:
             df[datetime_cols] = df[datetime_cols].apply(
                 pd.to_datetime,
                 utc=True,
                 axis=1,
                 meta={
-                    col: 'datetime64[ns]'
+                    col: 'datetime64[ns, UTC]'
                     for col in datetime_cols
                 }
             )
@@ -517,11 +527,15 @@ def parse_df_datetimes(
             + f"{traceback.format_exc()}"
         )

-
-
-
-
-
+    if strip_timezone:
+        for dt in datetime_cols:
+            try:
+                df[dt] = df[dt].dt.tz_localize(None)
+            except Exception:
+                warn(
+                    f"Unable to convert column '{dt}' to naive datetime:\n"
+                    + f"{traceback.format_exc()}"
+                )

     return df

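The new `strip_timezone` flag determines whether detected datetime columns keep their UTC awareness after parsing. A sketch of the underlying pandas behavior (pandas 2.x, which provides the `format='ISO8601'` argument):

    import pandas as pd

    df = pd.DataFrame({'dt': ['2024-01-01T00:00:00Z', '2024-01-02T00:00:00Z']})

    # parse_df_datetimes() now coerces to timezone-aware UTC first ...
    df['dt'] = pd.to_datetime(df['dt'], utc=True, format='ISO8601')
    print(df['dt'].dtype)  # datetime64[ns, UTC]

    # ... then strips the tzinfo when strip_timezone=True.
    df['dt'] = df['dt'].dt.tz_localize(None)
    print(df['dt'].dtype)  # datetime64[ns]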
@@ -674,6 +688,7 @@ def enforce_dtypes(
     dtypes: Dict[str, str],
     safe_copy: bool = True,
     coerce_numeric: bool = True,
+    coerce_timezone: bool = True,
     debug: bool = False,
 ) -> 'pd.DataFrame':
     """
@@ -695,6 +710,9 @@ def enforce_dtypes(
     coerce_numeric: bool, default True
         If `True`, convert float and int collisions to numeric.

+    coerce_timezone: bool, default True
+        If `True`, convert datetimes to UTC.
+
     debug: bool, default False
         Verbosity toggle.

@@ -703,20 +721,15 @@ def enforce_dtypes(
         The Pandas DataFrame with the types enforced.
     """
     import json
-    import traceback
-    from decimal import Decimal
     from meerschaum.utils.debug import dprint
-    from meerschaum.utils.warnings import warn
     from meerschaum.utils.formatting import pprint
-    from meerschaum.config.static import STATIC_CONFIG
-    from meerschaum.utils.packages import import_pandas
     from meerschaum.utils.dtypes import (
         are_dtypes_equal,
         to_pandas_dtype,
         is_dtype_numeric,
         attempt_cast_to_numeric,
         attempt_cast_to_uuid,
-        coerce_timezone,
+        coerce_timezone as _coerce_timezone,
     )
     if safe_copy:
         df = df.copy()
@@ -744,6 +757,11 @@ def enforce_dtypes(
         for col, typ in dtypes.items()
         if typ == 'uuid'
     ]
+    datetime_cols = [
+        col
+        for col, typ in dtypes.items()
+        if are_dtypes_equal(typ, 'datetime')
+    ]
     df_numeric_cols = get_numeric_cols(df)
     if debug:
         dprint("Desired data types:")
@@ -792,6 +810,12 @@ def enforce_dtypes(
         if debug:
             dprint(f"Unable to parse column '{col}' as UUID:\n{e}")

+    if datetime_cols and coerce_timezone:
+        if debug:
+            dprint(f"Checking for datetime conversion: {datetime_cols}")
+        for col in datetime_cols:
+            df[col] = _coerce_timezone(df[col])
+
     df_dtypes = {c: str(t) for c, t in df.dtypes.items()}
     if are_dtypes_equal(df_dtypes, pipe_pandas_dtypes):
         if debug:
@@ -826,8 +850,7 @@ def enforce_dtypes(
         if debug:
             dprint(
                 "The incoming DataFrame has mostly the same types, skipping enforcement."
-                + "The only detected difference was in the following datetime columns
-                + " Timezone information may be stripped."
+                + "The only detected difference was in the following datetime columns."
             )
             pprint(detected_dt_cols)
         return df
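With the new `coerce_timezone` flag, `enforce_dtypes()` passes any column whose desired dtype is a datetime through `coerce_timezone()`. A hedged sketch, assuming the defaults shown in this diff:

    import pandas as pd
    from meerschaum.utils.dataframe import enforce_dtypes

    df = pd.DataFrame({'dt': pd.to_datetime(['2024-01-01'])})  # naive
    # coerce_timezone defaults to True, so the column should become UTC-aware.
    df = enforce_dtypes(df, {'dt': 'datetime64[ns, UTC]'})
    print(df['dt'].dtype)  # expected: datetime64[ns, UTC]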
@@ -930,11 +953,15 @@ def get_datetime_bound_from_df(
     if datetime_column not in df.columns:
         return None

-
-
-
-
-
+    try:
+        dt_val = (
+            df[datetime_column].min(skipna=True)
+            if minimum
+            else df[datetime_column].max(skipna=True)
+        )
+    except Exception:
+        dt_val = pandas.NA
+    if is_dask and dt_val is not None and dt_val is not pandas.NA:
         dt_val = dt_val.compute()

     return (
@@ -1243,12 +1270,12 @@ def query_df(
     end_tz = end.tzinfo if end is not None else None

     if begin_tz is not None or end_tz is not None or df_tz is not None:
-        begin = coerce_timezone(begin)
-        end = coerce_timezone(end)
+        begin = coerce_timezone(begin, strip_utc=False)
+        end = coerce_timezone(end, strip_utc=False)
         if df_tz is not None:
             if debug:
                 dprint(f"Casting column '{datetime_column}' to UTC...")
-            df[datetime_column] = coerce_timezone(df[datetime_column])
+            df[datetime_column] = coerce_timezone(df[datetime_column], strip_utc=False)
         dprint(f"Using datetime bounds:\n{begin=}\n{end=}")

     in_ex_params = get_in_ex_params(params)
meerschaum/utils/dtypes/__init__.py
CHANGED
@@ -19,7 +19,7 @@ MRSM_PD_DTYPES: Dict[str, str] = {
     'json': 'object',
     'numeric': 'object',
     'uuid': 'object',
-    'datetime': 'datetime64[ns]',
+    'datetime': 'datetime64[ns, UTC]',
     'bool': 'bool[pyarrow]',
     'int': 'Int64',
     'int8': 'Int8',
@@ -245,7 +245,10 @@ def quantize_decimal(x: Decimal, scale: int, precision: int) -> Decimal:
     return x


-def coerce_timezone(dt: Any) -> Any:
+def coerce_timezone(
+    dt: Any,
+    strip_utc: bool = False,
+) -> Any:
     """
     Given a `datetime`, pandas `Timestamp` or `Series` of `Timestamp`,
     return a naive datetime in terms of UTC.
@@ -260,9 +263,17 @@ def coerce_timezone(dt: Any) -> Any:

     if dt_is_series:
         pandas = mrsm.attempt_import('pandas')
-
+        dt_series = (
+            pandas.to_datetime(dt, utc=True)
+        )
+        if strip_utc:
+            dt_series = dt_series.apply(lambda x: x.replace(tzinfo=None))
+
+        return dt_series

     if dt.tzinfo is None:
-
+        if strip_utc:
+            return dt
+        return dt.replace(tzinfo=timezone.utc)

-    return dt.astimezone(timezone.utc)
+    return dt.astimezone(timezone.utc)
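The reworked `coerce_timezone()` defaults to producing timezone-aware UTC values and only skips the conversion on request. A sketch of the scalar behavior per the hunk above:

    from datetime import datetime, timezone
    from meerschaum.utils.dtypes import coerce_timezone

    naive = datetime(2024, 1, 1, 12, 0)

    # Naive datetimes are assumed to be UTC and made timezone-aware ...
    print(coerce_timezone(naive))  # 2024-01-01 12:00:00+00:00

    # ... unless strip_utc=True, which returns naive values unchanged.
    print(coerce_timezone(naive, strip_utc=True))  # 2024-01-01 12:00:00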
meerschaum/utils/dtypes/sql.py
CHANGED
@@ -7,7 +7,7 @@ Utility functions for working with SQL data types.
 """

 from __future__ import annotations
-from meerschaum.utils.typing import Dict, Union, Tuple
+from meerschaum.utils.typing import Dict, Union, Tuple, List

 NUMERIC_PRECISION_FLAVORS: Dict[str, Tuple[int, int]] = {
     'mariadb': (38, 20),
@@ -16,6 +16,7 @@ NUMERIC_PRECISION_FLAVORS: Dict[str, Tuple[int, int]] = {
     'duckdb': (15, 4),
     'sqlite': (15, 4),
 }
+TIMEZONE_NAIVE_FLAVORS = {'oracle', 'mysql', 'mariadb'}

 ### MySQL doesn't allow for casting as BIGINT, so this is a workaround.
 DB_FLAVORS_CAST_DTYPES = {
@@ -56,6 +57,7 @@ DB_FLAVORS_CAST_DTYPES = {
         'VARCHAR COLLATE "SQL Latin1 General CP1 CI AS"': 'NVARCHAR(MAX)',
         'VARCHAR COLLATE "SQL_Latin1_General_CP1_CI_AS"': 'NVARCHAR(MAX)',
         'NVARCHAR': 'NVARCHAR(MAX)',
+        'BIT': 'INT',
     },
 }
 for _flavor, (_precision, _scale) in NUMERIC_PRECISION_FLAVORS.items():
@@ -78,7 +80,9 @@ DB_TO_PD_DTYPES: Dict[str, Union[str, Dict[str, str]]] = {
     'NUMBER': 'numeric',
     'NUMERIC': 'numeric',
     'TIMESTAMP': 'datetime64[ns]',
+    'TIMESTAMP WITHOUT TIMEZONE': 'datetime64[ns]',
     'TIMESTAMP WITH TIMEZONE': 'datetime64[ns, UTC]',
+    'TIMESTAMP WITH TIME ZONE': 'datetime64[ns, UTC]',
     'TIMESTAMPTZ': 'datetime64[ns, UTC]',
     'DATE': 'datetime64[ns]',
     'DATETIME': 'datetime64[ns]',
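The two new `TIMESTAMP ...` aliases cover both spellings emitted by different database flavors, so reflected columns resolve to the expected pandas dtypes. A hedged sketch using the lookup function defined in this module:

    from meerschaum.utils.dtypes.sql import get_pd_type_from_db_type

    # Per the DB_TO_PD_DTYPES table above:
    print(get_pd_type_from_db_type('TIMESTAMP WITH TIME ZONE'))    # expected: datetime64[ns, UTC]
    print(get_pd_type_from_db_type('TIMESTAMP WITHOUT TIMEZONE'))  # expected: datetime64[ns]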
@@ -160,7 +164,7 @@ PD_TO_DB_DTYPES_FLAVORS: Dict[str, Dict[str, str]] = {
         'mariadb': 'DATETIME',
         'mysql': 'DATETIME',
         'mssql': 'DATETIME2',
-        'oracle': '
+        'oracle': 'TIMESTAMP',
         'sqlite': 'DATETIME',
         'duckdb': 'TIMESTAMP',
         'citus': 'TIMESTAMP',
@@ -168,16 +172,16 @@ PD_TO_DB_DTYPES_FLAVORS: Dict[str, Dict[str, str]] = {
         'default': 'DATETIME',
     },
     'datetime64[ns, UTC]': {
-        'timescaledb': '
-        'postgresql': '
-        'mariadb': 'TIMESTAMP',
-        'mysql': '
+        'timescaledb': 'TIMESTAMPTZ',
+        'postgresql': 'TIMESTAMPTZ',
+        'mariadb': 'TIMESTAMP WITH TIME ZONE',
+        'mysql': 'DATETIME',
         'mssql': 'DATETIMEOFFSET',
         'oracle': 'TIMESTAMP',
         'sqlite': 'TIMESTAMP',
-        'duckdb': '
-        'citus': '
-        'cockroachdb': '
+        'duckdb': 'TIMESTAMPTZ',
+        'citus': 'TIMESTAMPTZ',
+        'cockroachdb': 'TIMESTAMPTZ',
         'default': 'TIMESTAMP',
     },
     'bool': {
@@ -185,7 +189,7 @@ PD_TO_DB_DTYPES_FLAVORS: Dict[str, Dict[str, str]] = {
         'postgresql': 'BOOLEAN',
         'mariadb': 'BOOLEAN',
         'mysql': 'BOOLEAN',
-        'mssql': '
+        'mssql': 'BIT',
         'oracle': 'INTEGER',
         'sqlite': 'FLOAT',
         'duckdb': 'BOOLEAN',
@@ -301,24 +305,24 @@ PD_TO_SQLALCHEMY_DTYPES_FLAVORS: Dict[str, Dict[str, str]] = {
         'default': 'DateTime',
     },
     'datetime64[ns, UTC]': {
-        'timescaledb': 'DateTime',
-        'postgresql': 'DateTime',
-        'mariadb': 'DateTime',
-        'mysql': 'DateTime',
+        'timescaledb': 'DateTime(timezone=True)',
+        'postgresql': 'DateTime(timezone=True)',
+        'mariadb': 'DateTime(timezone=True)',
+        'mysql': 'DateTime(timezone=True)',
         'mssql': 'sqlalchemy.dialects.mssql.DATETIMEOFFSET',
-        'oracle': '
-        'sqlite': 'DateTime',
-        'duckdb': 'DateTime',
-        'citus': 'DateTime',
-        'cockroachdb': 'DateTime',
-        'default': 'DateTime',
+        'oracle': 'sqlalchemy.dialects.oracle.TIMESTAMP(timezone=True)',
+        'sqlite': 'DateTime(timezone=True)',
+        'duckdb': 'DateTime(timezone=True)',
+        'citus': 'DateTime(timezone=True)',
+        'cockroachdb': 'DateTime(timezone=True)',
+        'default': 'DateTime(timezone=True)',
     },
     'bool': {
         'timescaledb': 'Boolean',
         'postgresql': 'Boolean',
         'mariadb': 'Integer',
         'mysql': 'Integer',
-        'mssql': '
+        'mssql': 'sqlalchemy.dialects.mssql.BIT',
         'oracle': 'Integer',
         'sqlite': 'Float',
         'duckdb': 'Boolean',
@@ -393,6 +397,20 @@ PD_TO_SQLALCHEMY_DTYPES_FLAVORS: Dict[str, Dict[str, str]] = {
     },
 }

+AUTO_INCREMENT_COLUMN_FLAVORS: Dict[str, str] = {
+    'timescaledb': 'GENERATED BY DEFAULT AS IDENTITY',
+    'postgresql': 'GENERATED BY DEFAULT AS IDENTITY',
+    'mariadb': 'AUTO_INCREMENT',
+    'mysql': 'AUTO_INCREMENT',
+    'mssql': 'IDENTITY(1,1)',
+    'oracle': 'GENERATED BY DEFAULT ON NULL AS IDENTITY',
+    'sqlite': 'AUTOINCREMENT',
+    'duckdb': 'GENERATED BY DEFAULT',
+    'citus': 'GENERATED BY DEFAULT',
+    'cockroachdb': 'GENERATED BY DEFAULT AS IDENTITY',
+    'default': 'GENERATED BY DEFAULT AS IDENTITY',
+}
+

 def get_pd_type_from_db_type(db_type: str, allow_custom_dtypes: bool = False) -> str:
     """
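A sketch of how the new `AUTO_INCREMENT_COLUMN_FLAVORS` lookup could be interpolated into DDL. The helper and names below are invented for illustration and are not an API from the package:

    from meerschaum.utils.dtypes.sql import AUTO_INCREMENT_COLUMN_FLAVORS

    def build_pk_clause(flavor: str, col: str = 'id') -> str:
        """Render a hypothetical auto-incrementing primary key clause."""
        increment = AUTO_INCREMENT_COLUMN_FLAVORS.get(
            flavor,
            AUTO_INCREMENT_COLUMN_FLAVORS['default'],
        )
        return f'"{col}" BIGINT {increment} PRIMARY KEY'

    print(build_pk_clause('postgresql'))
    # "id" BIGINT GENERATED BY DEFAULT AS IDENTITY PRIMARY KEY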
@@ -456,10 +474,10 @@ def get_db_type_from_pd_type(
         The database data type for the incoming Pandas data type.
         If nothing can be found, a warning will be thrown and 'TEXT' will be returned.
     """
-    import ast
     from meerschaum.utils.warnings import warn
     from meerschaum.utils.packages import attempt_import
     from meerschaum.utils.dtypes import are_dtypes_equal
+    from meerschaum.utils.misc import parse_arguments_str
     sqlalchemy_types = attempt_import('sqlalchemy.types')

     types_registry = (
@@ -512,15 +530,16 @@ def get_db_type_from_pd_type(

     if db_type.startswith('sqlalchemy.dialects'):
         dialect, typ_class_name = db_type.replace('sqlalchemy.dialects.', '').split('.', maxsplit=2)
-
+        cls_args, cls_kwargs = None, None
         if '(' in typ_class_name:
-            typ_class_name,
-
+            typ_class_name, args_str = typ_class_name.split('(', maxsplit=1)
+            args_str = args_str.rstrip(')')
+            cls_args, cls_kwargs = parse_arguments_str(args_str)
         sqlalchemy_dialects_flavor_module = attempt_import(f'sqlalchemy.dialects.{dialect}')
         cls = getattr(sqlalchemy_dialects_flavor_module, typ_class_name)
-        if
+        if cls_args is None:
             return cls
-        return cls(
+        return cls(*cls_args, **cls_kwargs)

     if 'numeric' in db_type.lower():
         numeric_type_str = PD_TO_DB_DTYPES_FLAVORS['numeric'].get(flavor, 'NUMERIC')
@@ -528,4 +547,15 @@ def get_db_type_from_pd_type(
             return sqlalchemy_types.Numeric
         precision, scale = NUMERIC_PRECISION_FLAVORS[flavor]
         return sqlalchemy_types.Numeric(precision, scale)
-
+
+    cls_args, cls_kwargs = None, None
+    typ_class_name = db_type
+    if '(' in db_type:
+        typ_class_name, args_str = db_type.split('(', maxsplit=1)
+        args_str = args_str.rstrip(')')
+        cls_args, cls_kwargs = parse_arguments_str(args_str)
+
+    cls = getattr(sqlalchemy_types, typ_class_name)
+    if cls_args is None:
+        return cls
+    return cls(*cls_args, **cls_kwargs)
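Together with `parse_arguments_str()`, type strings in the tables above may now carry constructor arguments, e.g. 'DateTime(timezone=True)' or 'Numeric(28, 10)', which are split off and passed to the SQLAlchemy class. A heavily hedged usage sketch (the `as_sqlalchemy` keyword is an assumption about this function's signature):

    from meerschaum.utils.dtypes.sql import get_db_type_from_pd_type

    # 'datetime64[ns, UTC]' should now resolve to a timezone-aware
    # type object for PostgreSQL-like flavors.
    typ = get_db_type_from_pd_type(
        'datetime64[ns, UTC]',
        flavor='postgresql',
        as_sqlalchemy=True,  # assumed keyword, per the SQLAlchemy code path above
    )
    print(typ)  # expected: DateTime(timezone=True)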