meerschaum 3.0.0rc1__py3-none-any.whl → 3.0.0rc2__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- meerschaum/_internal/arguments/_parser.py +2 -1
- meerschaum/_internal/docs/index.py +49 -2
- meerschaum/_internal/static.py +8 -24
- meerschaum/actions/verify.py +5 -8
- meerschaum/api/__init__.py +2 -1
- meerschaum/api/dash/__init__.py +0 -2
- meerschaum/api/dash/callbacks/dashboard.py +1 -1
- meerschaum/api/dash/tokens.py +2 -2
- meerschaum/api/routes/_pipes.py +47 -37
- meerschaum/config/_default.py +11 -1
- meerschaum/config/_version.py +1 -1
- meerschaum/config/stack/__init__.py +9 -8
- meerschaum/connectors/api/_pipes.py +2 -18
- meerschaum/connectors/api/_tokens.py +2 -2
- meerschaum/connectors/instance/_tokens.py +4 -4
- meerschaum/connectors/sql/_create_engine.py +3 -14
- meerschaum/connectors/sql/_pipes.py +118 -163
- meerschaum/connectors/sql/_sql.py +38 -20
- meerschaum/connectors/valkey/_pipes.py +44 -16
- meerschaum/core/Pipe/__init__.py +28 -5
- meerschaum/core/Pipe/_attributes.py +270 -46
- meerschaum/core/Pipe/_data.py +55 -17
- meerschaum/core/Pipe/_dtypes.py +19 -4
- meerschaum/core/Pipe/_edit.py +2 -0
- meerschaum/core/Pipe/_fetch.py +1 -1
- meerschaum/core/Pipe/_sync.py +90 -160
- meerschaum/core/Pipe/_verify.py +3 -3
- meerschaum/core/Token/_Token.py +3 -4
- meerschaum/utils/dataframe.py +379 -68
- meerschaum/utils/debug.py +15 -15
- meerschaum/utils/dtypes/__init__.py +388 -22
- meerschaum/utils/dtypes/sql.py +326 -30
- meerschaum/utils/misc.py +9 -68
- meerschaum/utils/packages/__init__.py +7 -21
- meerschaum/utils/packages/_packages.py +7 -2
- meerschaum/utils/schedule.py +1 -1
- meerschaum/utils/sql.py +7 -7
- {meerschaum-3.0.0rc1.dist-info → meerschaum-3.0.0rc2.dist-info}/METADATA +5 -17
- {meerschaum-3.0.0rc1.dist-info → meerschaum-3.0.0rc2.dist-info}/RECORD +45 -44
- meerschaum-3.0.0rc2.dist-info/licenses/NOTICE +2 -0
- {meerschaum-3.0.0rc1.dist-info → meerschaum-3.0.0rc2.dist-info}/WHEEL +0 -0
- {meerschaum-3.0.0rc1.dist-info → meerschaum-3.0.0rc2.dist-info}/entry_points.txt +0 -0
- {meerschaum-3.0.0rc1.dist-info → meerschaum-3.0.0rc2.dist-info}/licenses/LICENSE +0 -0
- {meerschaum-3.0.0rc1.dist-info → meerschaum-3.0.0rc2.dist-info}/top_level.txt +0 -0
- {meerschaum-3.0.0rc1.dist-info → meerschaum-3.0.0rc2.dist-info}/zip-safe +0 -0
meerschaum/core/Pipe/_sync.py
CHANGED
@@ -28,6 +28,7 @@ from meerschaum.utils.typing import (
     List,
 )
 from meerschaum.utils.warnings import warn, error
+from meerschaum._internal.static import STATIC_CONFIG
 
 if TYPE_CHECKING:
     pd = mrsm.attempt_import('pandas')
@@ -136,7 +137,7 @@ def sync(
     from meerschaum.utils.misc import df_is_chunk_generator, filter_keywords, filter_arguments
     from meerschaum.utils.pool import get_pool
     from meerschaum.config import get_config
-    from meerschaum.utils.dtypes import are_dtypes_equal
+    from meerschaum.utils.dtypes import are_dtypes_equal, get_current_timestamp
 
     if (callback is not None or error_callback is not None) and blocking:
         warn("Callback functions are only executed when blocking = False. Ignoring...")
@@ -164,8 +165,8 @@ def sync(
         'safe_copy': True,
     })
 
-
-    self.
+    self._invalidate_cache(debug=debug)
+    self._sync_ts = get_current_timestamp('ms')
 
     def _sync(
         p: mrsm.Pipe,
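Note: this hunk and most of those below replace ad-hoc `self._exists = None` resets with a single `p._invalidate_cache(debug=debug)` call, and record a sync timestamp via the new `get_current_timestamp()`. A minimal sketch of what such a helper plausibly does (the real implementation ships in this release, presumably alongside the other attribute helpers in meerschaum/core/Pipe/_attributes.py listed above, and may clear more state):

    # Hedged sketch only: attribute names beyond `_exists` / `_exists_timestamp`
    # (both visible in the `exists()` hunk further down) are assumptions.
    class Pipe:
        def _invalidate_cache(self, debug: bool = False) -> None:
            """Drop memoized attributes so the next lookup hits the instance."""
            for attr in ('_exists', '_exists_timestamp'):
                self.__dict__.pop(attr, None)
            if debug:
                print(f"Invalidated cached attributes for {self}.")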
@@ -178,7 +179,7 @@ def sync(
         ] = InferFetch,
     ) -> SuccessTuple:
         if df is None:
-            p.
+            p._invalidate_cache(debug=debug)
             return (
                 False,
                 f"You passed `None` instead of data into `sync()` for {p}.\n"
@@ -190,7 +191,7 @@ def sync(
         register_success, register_msg = p.register(debug=debug)
         if not register_success:
             if 'already' not in register_msg:
-                p.
+                p._invalidate_cache(debug=debug)
             return register_success, register_msg
 
         if isinstance(df, str):
@@ -211,10 +212,10 @@ def sync(
                 msg = f"{p} does not have a valid connector."
                 if p.connector_keys.startswith('plugin:'):
                     msg += f"\n    Perhaps {p.connector_keys} has a syntax error?"
-                p.
+                p._invalidate_cache(debug=debug)
                 return False, msg
         except Exception:
-            p.
+            p._invalidate_cache(debug=debug)
             return False, f"Unable to create the connector for {p}."
 
         ### Sync in place if possible.
@@ -228,7 +229,7 @@ def sync(
             get_config('system', 'experimental', 'inplace_sync')
         ):
             with Venv(get_connector_plugin(self.instance_connector)):
-                p.
+                p._invalidate_cache(debug=debug)
                 _args, _kwargs = filter_arguments(
                     p.instance_connector.sync_pipe_inplace,
                     p,
@@ -251,7 +252,7 @@ def sync(
                     **kw
                 )
                 return_tuple = p.connector.sync(*_args, **_kwargs)
-                p.
+                p._invalidate_cache(debug=debug)
                 if not isinstance(return_tuple, tuple):
                     return_tuple = (
                         False,
@@ -264,7 +265,7 @@ def sync(
                 msg = f"Failed to sync {p} with exception: '" + str(e) + "'"
                 if debug:
                     error(msg, silent=False)
-                p.
+                p._invalidate_cache(debug=debug)
                 return False, msg
 
         ### Fetch the dataframe from the connector's `fetch()` method.
@@ -289,7 +290,7 @@ def sync(
                 df = None
 
         if df is None:
-            p.
+            p._invalidate_cache(debug=debug)
             return False, f"No data were fetched for {p}."
 
         if isinstance(df, list):
@@ -303,7 +304,7 @@ def sync(
             return success, message
 
         if df is True:
-            p.
+            p._invalidate_cache(debug=debug)
             return True, f"{p} is being synced in parallel."
 
         ### CHECKPOINT: Retrieved the DataFrame.
@@ -347,7 +348,7 @@ def sync(
                     + f"(attempt {_chunk_attempts} / {_max_chunk_attempts}).\n"
                     + f"Sleeping for {_sleep_seconds} second"
                     + ('s' if _sleep_seconds != 1 else '')
-                    + ":\n{_chunk_msg}"
+                    + f":\n{_chunk_msg}"
                 ),
                 stack=False,
             )
@@ -400,34 +401,45 @@ def sync(
             return success, msg
 
         ### Cast to a dataframe and ensure datatypes are what we expect.
-
+        dtypes = p.get_dtypes(debug=debug)
+        df = p.enforce_dtypes(
             df,
             chunksize=chunksize,
             enforce=enforce_dtypes,
+            dtypes=dtypes,
             debug=debug,
         )
         if p.autotime:
-            dt_col = p.columns.get('datetime',
-
-
-
-
-
-
-
+            dt_col = p.columns.get('datetime', None)
+            ts_col = dt_col or mrsm.get_config(
+                'pipes', 'autotime', 'column_name_if_datetime_missing'
+            )
+            ts_typ = dtypes.get(ts_col, 'datetime') if ts_col else 'datetime'
+            if ts_col and hasattr(df, 'columns') and ts_col not in df.columns:
+                precision = p.get_precision(debug=debug)
+                now = get_current_timestamp(
+                    precision_unit=precision.get(
+                        'unit',
+                        STATIC_CONFIG['dtypes']['datetime']['default_precision_unit']
+                    ),
+                    precision_interval=precision.get('interval', 1),
+                    round_to=(precision.get('round_to', 'down')),
+                    as_int=(are_dtypes_equal(ts_typ, 'int')),
                 )
                 if debug:
-                    dprint(f"Adding current timestamp to dataframe synced to {p}: {
+                    dprint(f"Adding current timestamp to dataframe synced to {p}: {now}")
 
-                df[
-                kw['check_existing'] =
+                df[ts_col] = now
+                kw['check_existing'] = dt_col is not None
 
-        ### Capture
-        self.
-
-
-
-
+        ### Capture special columns.
+        capture_success, capture_msg = self._persist_new_special_columns(
+            df,
+            dtypes=dtypes,
+            debug=debug,
+        )
+        if not capture_success:
+            warn(f"Failed to capture new special columns for {self}:\n{capture_msg}")
 
         if debug:
             dprint(
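The autotime branch above stamps the timestamp column using the new precision-aware helper. `get_current_timestamp()` does exist in rc2's `meerschaum.utils.dtypes` (both call forms appear in this file), but the argument semantics sketched below are inferred from the diff rather than documented:

    # A sketch of both call forms used in this file; semantics are inferred.
    from meerschaum.utils.dtypes import get_current_timestamp

    # Positional unit, as in `self._sync_ts = get_current_timestamp('ms')`:
    sync_ts = get_current_timestamp('ms')

    # Keyword form, mirroring the autotime branch above:
    now = get_current_timestamp(
        precision_unit='ms',    # the pipe's precision unit, e.g. milliseconds
        precision_interval=1,   # bucket size within that unit
        round_to='down',        # floor to the bucket rather than rounding up
        as_int=False,           # True when the datetime axis is integer-typed
    )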
@@ -467,18 +479,17 @@ def sync(
 
         ### CHECKPOINT: Finished syncing. Handle caching.
         _checkpoint(**kw)
-        if
+        if p.cache_pipe is not None:
             if debug:
                 dprint("Caching retrieved dataframe.", **kw)
-            _sync_cache_tuple =
+            _sync_cache_tuple = p.cache_pipe.sync(df, debug=debug, **kw)
             if not _sync_cache_tuple[0]:
                 warn(f"Failed to sync local cache for {self}.")
 
-
+        p._invalidate_cache(debug=debug)
         return return_tuple
 
     if blocking:
-        self._exists = None
         return _sync(self, df=df)
 
     from meerschaum.utils.threading import Thread
@@ -503,10 +514,10 @@ def sync(
         )
         thread.start()
     except Exception as e:
-        self.
+        self._invalidate_cache(debug=debug)
         return False, str(e)
 
-    self.
+    self._invalidate_cache(debug=debug)
     return True, f"Spawned asyncronous sync for {self}."
 
 
@@ -552,7 +563,8 @@ def get_sync_time(
     """
     from meerschaum.utils.venv import Venv
     from meerschaum.connectors import get_connector_plugin
-    from meerschaum.utils.misc import
+    from meerschaum.utils.misc import filter_keywords
+    from meerschaum.utils.dtypes import round_time
     from meerschaum.utils.warnings import warn
 
     if not self.columns.get('datetime', None):
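`round_time` has moved out of `meerschaum.utils.misc` into `meerschaum.utils.dtypes`; the same relocation repeats below in `filter_existing()` and in `Token.__init__()`. External callers would update the import accordingly:

    # rc1:
    # from meerschaum.utils.misc import round_time
    # rc2:
    from meerschaum.utils.dtypes import round_time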
@@ -611,17 +623,16 @@ def exists(
     import time
     from meerschaum.utils.venv import Venv
     from meerschaum.connectors import get_connector_plugin
-    from meerschaum.config import STATIC_CONFIG
     from meerschaum.utils.debug import dprint
     now = time.perf_counter()
-
+    cache_seconds = mrsm.get_config('pipes', 'sync', 'exists_cache_seconds')
 
     _exists = self.__dict__.get('_exists', None)
     if _exists:
         exists_timestamp = self.__dict__.get('_exists_timestamp', None)
         if exists_timestamp is not None:
             delta = now - exists_timestamp
-            if delta <
+            if delta < cache_seconds:
                 if debug:
                     dprint(f"Returning cached `exists` for {self} ({round(delta, 2)} seconds old).")
                 return _exists
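The hard-coded TTL in `exists()` is replaced by a config lookup. The key is confirmed by the hunk above, and reading it uses the ordinary config API:

    import meerschaum as mrsm

    # How long `Pipe.exists()` may serve a memoized result before re-querying:
    ttl_seconds = mrsm.get_config('pipes', 'sync', 'exists_cache_seconds')
    print(f"exists() results are cached for {ttl_seconds} seconds.")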
@@ -686,7 +697,6 @@ def filter_existing(
     from meerschaum.utils.warnings import warn
     from meerschaum.utils.debug import dprint
     from meerschaum.utils.packages import attempt_import, import_pandas
-    from meerschaum.utils.misc import round_time
     from meerschaum.utils.dataframe import (
         filter_unseen_df,
         add_missing_cols_to_df,
@@ -698,6 +708,7 @@ def filter_existing(
         to_datetime,
         are_dtypes_equal,
         value_is_null,
+        round_time,
     )
     from meerschaum.config import get_config
     pd = import_pandas()
@@ -713,11 +724,13 @@ def filter_existing(
     merge = pd.merge
     NA = pd.NA
 
-
-
-
-
-
+    parameters = self.parameters
+    pipe_columns = parameters.get('columns', {})
+    primary_key = pipe_columns.get('primary', None)
+    dt_col = pipe_columns.get('datetime', None)
+    dt_type = parameters.get('dtypes', {}).get(dt_col, 'datetime') if dt_col else None
+    autoincrement = parameters.get('autoincrement', False)
+    autotime = parameters.get('autotime', False)
 
     if primary_key and autoincrement and df is not None and primary_key in df.columns:
         if safe_copy:
@@ -738,7 +751,7 @@ def filter_existing(
     def get_empty_df():
         empty_df = pd.DataFrame([])
         dtypes = dict(df.dtypes) if df is not None else {}
-        dtypes.update(self.dtypes)
+        dtypes.update(self.dtypes) if self.enforce else {}
         pd_dtypes = {
             col: to_pandas_dtype(str(typ))
             for col, typ in dtypes.items()
@@ -754,9 +767,6 @@ def filter_existing(
 
     ### begin is the oldest data in the new dataframe
     begin, end = None, None
-    dt_col = pipe_columns.get('datetime', None)
-    primary_key = pipe_columns.get('primary', None)
-    dt_type = self.dtypes.get(dt_col, 'datetime64[ns, UTC]') if dt_col else None
 
     if autoincrement and primary_key == dt_col and dt_col not in df.columns:
         if enforce_dtypes:
@@ -884,7 +894,8 @@ def filter_existing(
                 and col in backtrack_df.columns
             )
         ] if not primary_key else [primary_key]
-
+
+    self_dtypes = self.get_dtypes(debug=debug) if self.enforce else {}
     on_cols_dtypes = {
         col: to_pandas_dtype(typ)
         for col, typ in self_dtypes.items()
@@ -1037,117 +1048,36 @@ def get_num_workers(self, workers: Optional[int] = None) -> int:
     )
 
 
-def
-
-
-
-
-
-    existing_numeric_cols = [col for col, typ in self.dtypes.items() if typ.startswith('numeric')]
-    new_numeric_cols = [col for col in numeric_cols if col not in existing_numeric_cols]
-    if not new_numeric_cols:
-        return True, "Success"
-
-    self._attributes_sync_time = None
-    dtypes = self.parameters.get('dtypes', {})
-    dtypes.update({col: 'numeric' for col in new_numeric_cols})
-    return self.update_parameters({'dtypes': dtypes}, debug=debug)
-
-
-def _persist_new_uuid_columns(self, df, debug: bool = False) -> SuccessTuple:
-    """
-    Check for new numeric columns and update the parameters.
-    """
-    from meerschaum.utils.dataframe import get_uuid_cols
-    uuid_cols = get_uuid_cols(df)
-    existing_uuid_cols = [col for col, typ in self.dtypes.items() if typ == 'uuid']
-    new_uuid_cols = [col for col in uuid_cols if col not in existing_uuid_cols]
-    if not new_uuid_cols:
-        return True, "Success"
-
-    self._attributes_sync_time = None
-    dtypes = self.parameters.get('dtypes', {})
-    dtypes.update({col: 'uuid' for col in new_uuid_cols})
-    return self.update_parameters({'dtypes': dtypes}, debug=debug)
-
-
-def _persist_new_json_columns(self, df, debug: bool = False) -> SuccessTuple:
-    """
-    Check for new JSON columns and update the parameters.
-    """
-    from meerschaum.utils.dataframe import get_json_cols
-    json_cols = get_json_cols(df)
-    existing_json_cols = [col for col, typ in self.dtypes.items() if typ == 'json']
-    new_json_cols = [col for col in json_cols if col not in existing_json_cols]
-    if not new_json_cols:
-        return True, "Success"
-
-    self._attributes_sync_time = None
-    dtypes = self.parameters.get('dtypes', {})
-    dtypes.update({col: 'json' for col in new_json_cols})
-    return self.update_parameters({'dtypes': dtypes}, debug=debug)
-
-
-def _persist_new_bytes_columns(self, df, debug: bool = False) -> SuccessTuple:
+def _persist_new_special_columns(
+    self,
+    df: 'pd.DataFrame',
+    dtypes: Optional[Dict[str, str]] = None,
+    debug: bool = False,
+) -> mrsm.SuccessTuple:
     """
-    Check for new
+    Check for new special columns and update the parameters accordingly.
     """
-    from meerschaum.utils.dataframe import
-
-
-
-
+    from meerschaum.utils.dataframe import get_special_cols
+    from meerschaum.utils.dtypes import dtype_is_special
+    from meerschaum.utils.warnings import dprint
+
+    special_cols = get_special_cols(df)
+    dtypes = dtypes or self.get_dtypes(debug=debug)
+    existing_special_cols = {
+        col: typ
+        for col, typ in dtypes.items()
+        if dtype_is_special(typ)
+    }
+    new_special_cols = {
+        col: typ
+        for col, typ in special_cols.items()
+        if col not in existing_special_cols
+    }
+    if not new_special_cols:
         return True, "Success"
 
-
-
-    dtypes.update({col: 'bytes' for col in new_bytes_cols})
-    return self.update_parameters({'dtypes': dtypes}, debug=debug)
-
-
-def _persist_new_geometry_columns(self, df, debug: bool = False) -> SuccessTuple:
-    """
-    Check for new `geometry` columns and update the parameters.
-    """
-    from meerschaum.utils.dataframe import get_geometry_cols
-    geometry_cols_types_srids = get_geometry_cols(df, with_types_srids=True)
-    existing_geometry_cols = [
-        col
-        for col, typ in self.dtypes.items()
-        if typ.startswith('geometry') or typ.startswith('geography')
-    ]
-    new_geometry_cols = [
-        col
-        for col in geometry_cols_types_srids
-        if col not in existing_geometry_cols
-    ]
-    if not new_geometry_cols:
-        return True, "Success"
+    if debug:
+        dprint(f"New special columns:\n{new_special_cols}")
 
     self._attributes_sync_time = None
-
-
-    new_cols_types = {}
-    for col, (geometry_type, srid) in geometry_cols_types_srids.items():
-        if col not in new_geometry_cols:
-            continue
-
-        new_dtype = "geometry"
-        modifier = ""
-        if not srid and geometry_type.lower() == 'geometry':
-            new_cols_types[col] = new_dtype
-            continue
-
-        modifier = "["
-        if geometry_type.lower() != 'geometry':
-            modifier += f"{geometry_type}"
-
-        if srid:
-            if modifier != '[':
-                modifier += ", "
-            modifier += f"{srid}"
-        modifier += "]"
-        new_cols_types[col] = f"{new_dtype}{modifier}"
-
-    dtypes.update(new_cols_types)
-    return self.update_parameters({'dtypes': dtypes})
+    return self.update_parameters({'dtypes': new_special_cols}, debug=debug)
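The five per-dtype helpers from rc1 (numeric, uuid, json, bytes, geometry) collapse into the single `_persist_new_special_columns()` above, driven by `get_special_cols()` and `dtype_is_special()`. Those two helpers are real in rc2 but their bodies are not part of this diff, so the stand-ins below only illustrate the consolidated detection logic:

    # Stand-in sketch; the real helpers live in meerschaum.utils.dataframe and
    # meerschaum.utils.dtypes and may differ in detail.
    from typing import Dict

    def dtype_is_special(typ: str) -> bool:
        """Assumed check for the dtypes Meerschaum registers explicitly."""
        return typ.startswith(('numeric', 'uuid', 'json', 'bytes', 'geometry', 'geography'))

    def diff_special_cols(
        special_cols: Dict[str, str],
        known_dtypes: Dict[str, str],
    ) -> Dict[str, str]:
        """Return only the special columns not already registered on the pipe."""
        existing = {col for col, typ in known_dtypes.items() if dtype_is_special(typ)}
        return {col: typ for col, typ in special_cols.items() if col not in existing}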
meerschaum/core/Pipe/_verify.py
CHANGED
@@ -418,7 +418,7 @@ def verify(
         retry_failed_batch = False
 
         batch_msg_to_print = (
-            f"{make_header('Completed batch ' + batch_counter_str + ':')}\n{batch_msg}"
+            f"{make_header('Completed batch ' + batch_counter_str + ':', left_pad=0)}\n{batch_msg}"
         )
         mrsm.pprint((batch_success, batch_msg_to_print))
 
@@ -426,7 +426,7 @@ def verify(
             info(f"Retrying batch {batch_counter_str}...")
             retry_batch_success, retry_batch_msg = process_batch(batch)
             retry_batch_msg_to_print = (
-                f"Retried {make_header('batch ' + batch_label)}\n{retry_batch_msg}"
+                f"Retried {make_header('batch ' + batch_label, left_pad=0)}\n{retry_batch_msg}"
             )
             mrsm.pprint((retry_batch_success, retry_batch_msg_to_print))
 
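Both fixes pass `left_pad=0` so the batch headers print flush against the left margin. Assuming `make_header` is the helper from `meerschaum.utils.formatting` (its import sits outside this diff), usage looks like:

    from meerschaum.utils.formatting import make_header  # import path assumed

    print(make_header('Completed batch 1 / 4:', left_pad=0))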
@@ -587,7 +587,7 @@ def get_bound_interval(self, debug: bool = False) -> Union[timedelta, int, None]
     if not dt_col:
         return bound_time_value
 
-    dt_typ = self.dtypes.get(dt_col, '
+    dt_typ = self.dtypes.get(dt_col, 'datetime')
     if 'int' in dt_typ.lower():
         return int(bound_time_value)
 
meerschaum/core/Token/_Token.py
CHANGED
@@ -14,7 +14,6 @@ from typing import Optional, Union, List, Tuple
 from datetime import datetime, timedelta, timezone
 
 import meerschaum as mrsm
-from meerschaum.models import TokenModel
 
 _PLACEHOLDER_EXPIRATION = datetime(2000, 1, 1)
 
@@ -38,9 +37,8 @@ class Token:
         secret: Optional[str] = None,
         secret_hash: Optional[str] = None,
     ):
-        from meerschaum.utils.dtypes import coerce_timezone
+        from meerschaum.utils.dtypes import coerce_timezone, round_time
         from meerschaum.utils.daemon import get_new_daemon_name
-        from meerschaum.utils.misc import round_time
         from meerschaum._internal.static import STATIC_CONFIG
         now = datetime.now(timezone.utc)
         default_expiration_days = mrsm.get_config(
@@ -153,10 +151,11 @@ class Token:
             return False
         return self.instance_connector.token_exists(self.id, debug=debug)
 
-    def to_model(self, refresh: bool = False, debug: bool = False) -> TokenModel:
+    def to_model(self, refresh: bool = False, debug: bool = False) -> 'TokenModel':
         """
         Export the current state to a `TokenModel`.
         """
+        from meerschaum.models import TokenModel
         in_memory_doc = {
             'id': self.id,
             'label': self.label,