meerschaum 3.0.0rc1__py3-none-any.whl → 3.0.0rc3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- meerschaum/_internal/arguments/_parser.py +2 -1
- meerschaum/_internal/docs/index.py +49 -2
- meerschaum/_internal/shell/Shell.py +5 -4
- meerschaum/_internal/static.py +8 -24
- meerschaum/actions/bootstrap.py +1 -1
- meerschaum/actions/edit.py +6 -3
- meerschaum/actions/start.py +1 -1
- meerschaum/actions/verify.py +5 -8
- meerschaum/api/__init__.py +2 -1
- meerschaum/api/dash/__init__.py +0 -2
- meerschaum/api/dash/callbacks/__init__.py +1 -0
- meerschaum/api/dash/callbacks/dashboard.py +20 -19
- meerschaum/api/dash/callbacks/jobs.py +11 -5
- meerschaum/api/dash/callbacks/pipes.py +106 -5
- meerschaum/api/dash/callbacks/settings/__init__.py +0 -1
- meerschaum/api/dash/callbacks/{settings/tokens.py → tokens.py} +1 -1
- meerschaum/api/dash/jobs.py +1 -1
- meerschaum/api/dash/pages/__init__.py +2 -1
- meerschaum/api/dash/pages/{job.py → jobs.py} +10 -7
- meerschaum/api/dash/pages/pipes.py +4 -3
- meerschaum/api/dash/pages/settings/__init__.py +0 -1
- meerschaum/api/dash/pages/{settings/tokens.py → tokens.py} +6 -8
- meerschaum/api/dash/pipes.py +131 -0
- meerschaum/api/dash/tokens.py +28 -31
- meerschaum/api/routes/_pipes.py +47 -37
- meerschaum/config/_default.py +13 -2
- meerschaum/config/_paths.py +1 -0
- meerschaum/config/_version.py +1 -1
- meerschaum/config/stack/__init__.py +9 -8
- meerschaum/connectors/api/_pipes.py +2 -18
- meerschaum/connectors/api/_tokens.py +2 -2
- meerschaum/connectors/instance/_tokens.py +10 -6
- meerschaum/connectors/sql/_SQLConnector.py +14 -0
- meerschaum/connectors/sql/_create_engine.py +3 -14
- meerschaum/connectors/sql/_pipes.py +175 -185
- meerschaum/connectors/sql/_sql.py +38 -20
- meerschaum/connectors/sql/tables/__init__.py +237 -122
- meerschaum/connectors/valkey/_pipes.py +44 -16
- meerschaum/core/Pipe/__init__.py +28 -5
- meerschaum/core/Pipe/_attributes.py +273 -46
- meerschaum/core/Pipe/_data.py +55 -17
- meerschaum/core/Pipe/_dtypes.py +19 -4
- meerschaum/core/Pipe/_edit.py +2 -0
- meerschaum/core/Pipe/_fetch.py +1 -1
- meerschaum/core/Pipe/_sync.py +90 -160
- meerschaum/core/Pipe/_verify.py +3 -3
- meerschaum/core/Token/_Token.py +4 -5
- meerschaum/plugins/bootstrap.py +508 -3
- meerschaum/utils/_get_pipes.py +1 -1
- meerschaum/utils/dataframe.py +385 -68
- meerschaum/utils/debug.py +15 -15
- meerschaum/utils/dtypes/__init__.py +387 -22
- meerschaum/utils/dtypes/sql.py +327 -31
- meerschaum/utils/misc.py +9 -68
- meerschaum/utils/packages/__init__.py +7 -21
- meerschaum/utils/packages/_packages.py +7 -2
- meerschaum/utils/schedule.py +1 -1
- meerschaum/utils/sql.py +8 -8
- {meerschaum-3.0.0rc1.dist-info → meerschaum-3.0.0rc3.dist-info}/METADATA +5 -17
- {meerschaum-3.0.0rc1.dist-info → meerschaum-3.0.0rc3.dist-info}/RECORD +66 -65
- meerschaum-3.0.0rc3.dist-info/licenses/NOTICE +2 -0
- {meerschaum-3.0.0rc1.dist-info → meerschaum-3.0.0rc3.dist-info}/WHEEL +0 -0
- {meerschaum-3.0.0rc1.dist-info → meerschaum-3.0.0rc3.dist-info}/entry_points.txt +0 -0
- {meerschaum-3.0.0rc1.dist-info → meerschaum-3.0.0rc3.dist-info}/licenses/LICENSE +0 -0
- {meerschaum-3.0.0rc1.dist-info → meerschaum-3.0.0rc3.dist-info}/top_level.txt +0 -0
- {meerschaum-3.0.0rc1.dist-info → meerschaum-3.0.0rc3.dist-info}/zip-safe +0 -0
meerschaum/utils/dataframe.py
CHANGED
@@ -8,14 +8,13 @@ Utility functions for working with DataFrames.
|
|
8
8
|
|
9
9
|
from __future__ import annotations
|
10
10
|
|
11
|
-
import
|
12
|
-
from datetime import datetime, timezone
|
11
|
+
from datetime import datetime, timezone, date
|
13
12
|
from collections import defaultdict
|
14
13
|
|
15
14
|
import meerschaum as mrsm
|
16
15
|
from meerschaum.utils.typing import (
|
17
16
|
Optional, Dict, Any, List, Hashable, Generator,
|
18
|
-
Iterator, Iterable, Union, TYPE_CHECKING,
|
17
|
+
Iterator, Iterable, Union, TYPE_CHECKING, Tuple,
|
19
18
|
)
|
20
19
|
|
21
20
|
if TYPE_CHECKING:
|
@@ -249,8 +248,10 @@ def filter_unseen_df(
|
|
249
248
|
cast_dt_cols = True
|
250
249
|
try:
|
251
250
|
for col, typ in dt_dtypes.items():
|
251
|
+
_dtypes_col_dtype = str((dtypes or {}).get(col, 'datetime'))
|
252
252
|
strip_utc = (
|
253
|
-
|
253
|
+
_dtypes_col_dtype.startswith('datetime64')
|
254
|
+
and 'utc' not in _dtypes_col_dtype.lower()
|
254
255
|
)
|
255
256
|
if col in old_df.columns:
|
256
257
|
old_df[col] = coerce_timezone(old_df[col], strip_utc=strip_utc)
|
@@ -327,8 +328,10 @@ def filter_unseen_df(
|
|
327
328
|
if are_dtypes_equal(str(typ), 'datetime')
|
328
329
|
]
|
329
330
|
for col in old_dt_cols:
|
331
|
+
_dtypes_col_dtype = str((dtypes or {}).get(col, 'datetime'))
|
330
332
|
strip_utc = (
|
331
|
-
|
333
|
+
_dtypes_col_dtype.startswith('datetime64')
|
334
|
+
and 'utc' not in _dtypes_col_dtype.lower()
|
332
335
|
)
|
333
336
|
old_df[col] = coerce_timezone(old_df[col], strip_utc=strip_utc)
|
334
337
|
|
@@ -338,8 +341,10 @@ def filter_unseen_df(
|
|
338
341
|
if are_dtypes_equal(str(typ), 'datetime')
|
339
342
|
]
|
340
343
|
for col in new_dt_cols:
|
344
|
+
_dtypes_col_dtype = str((dtypes or {}).get(col, 'datetime'))
|
341
345
|
strip_utc = (
|
342
|
-
|
346
|
+
_dtypes_col_dtype.startswith('datetime64')
|
347
|
+
and 'utc' not in _dtypes_col_dtype.lower()
|
343
348
|
)
|
344
349
|
new_df[col] = coerce_timezone(new_df[col], strip_utc=strip_utc)
|
345
350
|
|
@@ -423,6 +428,8 @@ def parse_df_datetimes(
|
|
423
428
|
chunksize: Optional[int] = None,
|
424
429
|
dtype_backend: str = 'numpy_nullable',
|
425
430
|
ignore_all: bool = False,
|
431
|
+
precision_unit: Optional[str] = None,
|
432
|
+
coerce_utc: bool = True,
|
426
433
|
debug: bool = False,
|
427
434
|
) -> 'pd.DataFrame':
|
428
435
|
"""
|
@@ -450,6 +457,12 @@ def parse_df_datetimes(
|
|
450
457
|
ignore_all: bool, default False
|
451
458
|
If `True`, do not attempt to cast any columns to datetimes.
|
452
459
|
|
460
|
+
precision_unit: Optional[str], default None
|
461
|
+
If provided, enforce the given precision on the coerced datetime columns.
|
462
|
+
|
463
|
+
coerce_utc: bool, default True
|
464
|
+
Coerce the datetime columns to UTC (see `meerschaum.utils.dtypes.to_datetime()`).
|
465
|
+
|
453
466
|
debug: bool, default False
|
454
467
|
Verbosity toggle.
|
455
468
|
|
@@ -466,9 +479,9 @@ def parse_df_datetimes(
|
|
466
479
|
>>> df.dtypes
|
467
480
|
a object
|
468
481
|
dtype: object
|
469
|
-
>>>
|
470
|
-
>>>
|
471
|
-
a datetime64[
|
482
|
+
>>> df2 = parse_df_datetimes(df)
|
483
|
+
>>> df2.dtypes
|
484
|
+
a datetime64[us, UTC]
|
472
485
|
dtype: object
|
473
486
|
|
474
487
|
```
|
@@ -478,8 +491,9 @@ def parse_df_datetimes(
|
|
478
491
|
from meerschaum.utils.debug import dprint
|
479
492
|
from meerschaum.utils.warnings import warn
|
480
493
|
from meerschaum.utils.misc import items_str
|
481
|
-
from meerschaum.utils.dtypes import to_datetime
|
494
|
+
from meerschaum.utils.dtypes import to_datetime, MRSM_PD_DTYPES
|
482
495
|
import traceback
|
496
|
+
|
483
497
|
pd = import_pandas()
|
484
498
|
pandas = attempt_import('pandas')
|
485
499
|
pd_name = pd.__name__
|
@@ -567,22 +581,25 @@ def parse_df_datetimes(
|
|
567
581
|
if debug:
|
568
582
|
dprint("Converting columns to datetimes: " + str(datetime_cols))
|
569
583
|
|
584
|
+
def _parse_to_datetime(x):
|
585
|
+
return to_datetime(x, precision_unit=precision_unit, coerce_utc=coerce_utc)
|
586
|
+
|
570
587
|
try:
|
571
588
|
if not using_dask:
|
572
|
-
df[datetime_cols] = df[datetime_cols].apply(
|
589
|
+
df[datetime_cols] = df[datetime_cols].apply(_parse_to_datetime)
|
573
590
|
else:
|
574
591
|
df[datetime_cols] = df[datetime_cols].apply(
|
575
|
-
|
592
|
+
_parse_to_datetime,
|
576
593
|
utc=True,
|
577
594
|
axis=1,
|
578
595
|
meta={
|
579
|
-
col: '
|
596
|
+
col: MRSM_PD_DTYPES['datetime']
|
580
597
|
for col in datetime_cols
|
581
598
|
}
|
582
599
|
)
|
583
600
|
except Exception:
|
584
601
|
warn(
|
585
|
-
f"Unable to apply `
|
602
|
+
f"Unable to apply `to_datetime()` to {items_str(datetime_cols)}:\n"
|
586
603
|
+ f"{traceback.format_exc()}"
|
587
604
|
)
|
588
605
|
|
@@ -660,8 +677,7 @@ def get_json_cols(df: 'pd.DataFrame') -> List[str]:
|
|
660
677
|
for col, ix in cols_indices.items()
|
661
678
|
if (
|
662
679
|
ix is not None
|
663
|
-
and
|
664
|
-
not isinstance(df.loc[ix][col], Hashable)
|
680
|
+
and isinstance(df.loc[ix][col], (dict, list))
|
665
681
|
)
|
666
682
|
]
|
667
683
|
|
@@ -704,6 +720,38 @@ def get_numeric_cols(df: 'pd.DataFrame') -> List[str]:
|
|
704
720
|
]
|
705
721
|
|
706
722
|
|
723
|
+
def get_bool_cols(df: 'pd.DataFrame') -> List[str]:
|
724
|
+
"""
|
725
|
+
Get the columns which contain `bool` objects from a Pandas DataFrame.
|
726
|
+
|
727
|
+
Parameters
|
728
|
+
----------
|
729
|
+
df: pd.DataFrame
|
730
|
+
The DataFrame which may contain bools.
|
731
|
+
|
732
|
+
Returns
|
733
|
+
-------
|
734
|
+
A list of columns to treat as bools.
|
735
|
+
"""
|
736
|
+
if df is None:
|
737
|
+
return []
|
738
|
+
|
739
|
+
is_dask = 'dask' in df.__module__
|
740
|
+
if is_dask:
|
741
|
+
df = get_first_valid_dask_partition(df)
|
742
|
+
|
743
|
+
if len(df) == 0:
|
744
|
+
return []
|
745
|
+
|
746
|
+
from meerschaum.utils.dtypes import are_dtypes_equal
|
747
|
+
|
748
|
+
return [
|
749
|
+
col
|
750
|
+
for col, typ in df.dtypes.items()
|
751
|
+
if are_dtypes_equal(str(typ), 'bool')
|
752
|
+
]
|
753
|
+
|
754
|
+
|
707
755
|
def get_uuid_cols(df: 'pd.DataFrame') -> List[str]:
|
708
756
|
"""
|
709
757
|
Get the columns which contain `uuid.UUID` objects from a Pandas DataFrame.
|
@@ -746,7 +794,8 @@ def get_datetime_cols(
|
|
746
794
|
df: 'pd.DataFrame',
|
747
795
|
timezone_aware: bool = True,
|
748
796
|
timezone_naive: bool = True,
|
749
|
-
|
797
|
+
with_tz_precision: bool = False,
|
798
|
+
) -> Union[List[str], Dict[str, Tuple[Union[str, None], str]]]:
|
750
799
|
"""
|
751
800
|
Get the columns which contain `datetime` or `Timestamp` objects from a Pandas DataFrame.
|
752
801
|
|
@@ -761,76 +810,233 @@ def get_datetime_cols(
|
|
761
810
|
timezone_naive: bool, default True
|
762
811
|
If `True`, include timezone-naive datetime columns.
|
763
812
|
|
813
|
+
with_tz_precision: bool, default False
|
814
|
+
If `True`, return a dictionary mapping column names to tuples in the form
|
815
|
+
`(timezone, precision)`.
|
816
|
+
|
764
817
|
Returns
|
765
818
|
-------
|
766
|
-
A list of columns to treat as datetimes
|
819
|
+
A list of columns to treat as datetimes, or a dictionary of columns to tz+precision tuples
|
820
|
+
(if `with_tz_precision` is `True`).
|
767
821
|
"""
|
768
822
|
if not timezone_aware and not timezone_naive:
|
769
823
|
raise ValueError("`timezone_aware` and `timezone_naive` cannot both be `False`.")
|
770
824
|
|
771
825
|
if df is None:
|
772
|
-
return []
|
826
|
+
return [] if not with_tz_precision else {}
|
773
827
|
|
774
828
|
from datetime import datetime
|
775
|
-
from meerschaum.utils.dtypes import are_dtypes_equal
|
829
|
+
from meerschaum.utils.dtypes import are_dtypes_equal, MRSM_PRECISION_UNITS_ALIASES
|
776
830
|
is_dask = 'dask' in df.__module__
|
777
831
|
if is_dask:
|
778
832
|
df = get_first_valid_dask_partition(df)
|
833
|
+
|
834
|
+
def get_tz_precision_from_dtype(dtype: str) -> Tuple[Union[str, None], str]:
|
835
|
+
"""
|
836
|
+
Extract the tz + precision tuple from a dtype string.
|
837
|
+
"""
|
838
|
+
meta_str = dtype.split('[', maxsplit=1)[-1].rstrip(']').replace(' ', '')
|
839
|
+
tz = (
|
840
|
+
None
|
841
|
+
if ',' not in meta_str
|
842
|
+
else meta_str.split(',', maxsplit=1)[-1]
|
843
|
+
)
|
844
|
+
precision_abbreviation = (
|
845
|
+
meta_str
|
846
|
+
if ',' not in meta_str
|
847
|
+
else meta_str.split(',')[0]
|
848
|
+
)
|
849
|
+
precision = MRSM_PRECISION_UNITS_ALIASES[precision_abbreviation]
|
850
|
+
return tz, precision
|
779
851
|
|
780
|
-
|
781
|
-
|
852
|
+
def get_tz_precision_from_datetime(dt: datetime) -> Tuple[Union[str, None], str]:
|
853
|
+
"""
|
854
|
+
Return the tz + precision tuple from a Python datetime object.
|
855
|
+
"""
|
856
|
+
return dt.tzname(), 'microsecond'
|
857
|
+
|
858
|
+
known_dt_cols_types = {
|
859
|
+
col: str(typ)
|
782
860
|
for col, typ in df.dtypes.items()
|
783
861
|
if are_dtypes_equal('datetime', str(typ))
|
784
|
-
|
862
|
+
}
|
863
|
+
|
864
|
+
known_dt_cols_tuples = {
|
865
|
+
col: get_tz_precision_from_dtype(typ)
|
866
|
+
for col, typ in known_dt_cols_types.items()
|
867
|
+
}
|
785
868
|
|
786
869
|
if len(df) == 0:
|
787
|
-
return
|
870
|
+
return (
|
871
|
+
list(known_dt_cols_types)
|
872
|
+
if not with_tz_precision
|
873
|
+
else known_dt_cols_tuples
|
874
|
+
)
|
788
875
|
|
789
876
|
cols_indices = {
|
790
877
|
col: df[col].first_valid_index()
|
791
878
|
for col in df.columns
|
792
|
-
if col not in
|
879
|
+
if col not in known_dt_cols_types
|
793
880
|
}
|
794
|
-
|
795
|
-
col
|
881
|
+
pydt_cols_tuples = {
|
882
|
+
col: get_tz_precision_from_datetime(sample_val)
|
796
883
|
for col, ix in cols_indices.items()
|
797
884
|
if (
|
798
885
|
ix is not None
|
799
886
|
and
|
800
|
-
isinstance(df.loc[ix][col], datetime)
|
887
|
+
isinstance((sample_val := df.loc[ix][col]), datetime)
|
801
888
|
)
|
802
|
-
|
803
|
-
|
804
|
-
|
805
|
-
|
889
|
+
}
|
890
|
+
|
891
|
+
dt_cols_tuples = {
|
892
|
+
**known_dt_cols_tuples,
|
893
|
+
**pydt_cols_tuples
|
894
|
+
}
|
895
|
+
|
896
|
+
all_dt_cols_tuples = {
|
897
|
+
col: dt_cols_tuples[col]
|
806
898
|
for col in df.columns
|
807
|
-
if col in
|
808
|
-
|
899
|
+
if col in dt_cols_tuples
|
900
|
+
}
|
809
901
|
if timezone_aware and timezone_naive:
|
810
|
-
return
|
902
|
+
return (
|
903
|
+
list(all_dt_cols_tuples)
|
904
|
+
if not with_tz_precision
|
905
|
+
else all_dt_cols_tuples
|
906
|
+
)
|
811
907
|
|
812
908
|
known_timezone_aware_dt_cols = [
|
813
909
|
col
|
814
|
-
for col in
|
910
|
+
for col in known_dt_cols_types
|
815
911
|
if getattr(df[col], 'tz', None) is not None
|
816
912
|
]
|
817
|
-
|
818
|
-
col
|
819
|
-
for col in
|
913
|
+
timezone_aware_pydt_cols_tuples = {
|
914
|
+
col: (tz, precision)
|
915
|
+
for col, (tz, precision) in pydt_cols_tuples.items()
|
820
916
|
if df.loc[cols_indices[col]][col].tzinfo is not None
|
821
|
-
|
822
|
-
timezone_aware_dt_cols_set = set(
|
917
|
+
}
|
918
|
+
timezone_aware_dt_cols_set = set(
|
919
|
+
known_timezone_aware_dt_cols + list(timezone_aware_pydt_cols_tuples)
|
920
|
+
)
|
921
|
+
timezone_aware_cols_tuples = {
|
922
|
+
col: (tz, precision)
|
923
|
+
for col, (tz, precision) in all_dt_cols_tuples.items()
|
924
|
+
if col in timezone_aware_dt_cols_set
|
925
|
+
}
|
926
|
+
timezone_naive_cols_tuples = {
|
927
|
+
col: (tz, precision)
|
928
|
+
for col, (tz, precision) in all_dt_cols_tuples.items()
|
929
|
+
if col not in timezone_aware_dt_cols_set
|
930
|
+
}
|
931
|
+
|
823
932
|
if timezone_aware:
|
824
|
-
return
|
825
|
-
|
826
|
-
|
827
|
-
|
828
|
-
|
933
|
+
return (
|
934
|
+
list(timezone_aware_cols_tuples)
|
935
|
+
if not with_tz_precision
|
936
|
+
else timezone_aware_cols_tuples
|
937
|
+
)
|
938
|
+
|
939
|
+
return (
|
940
|
+
list(timezone_naive_cols_tuples)
|
941
|
+
if not with_tz_precision
|
942
|
+
else timezone_naive_cols_tuples
|
943
|
+
)
|
944
|
+
|
945
|
+
|
946
|
+
def get_datetime_cols_types(df: 'pd.DataFrame') -> Dict[str, str]:
|
947
|
+
"""
|
948
|
+
Return a dictionary mapping datetime columns to specific types strings.
|
949
|
+
|
950
|
+
Parameters
|
951
|
+
----------
|
952
|
+
df: pd.DataFrame
|
953
|
+
The DataFrame which may contain datetime columns.
|
954
|
+
|
955
|
+
Returns
|
956
|
+
-------
|
957
|
+
A dictionary mapping the datetime columns' names to dtype strings
|
958
|
+
(containing timezone and precision metadata).
|
959
|
+
|
960
|
+
Examples
|
961
|
+
--------
|
962
|
+
>>> from datetime import datetime, timezone
|
963
|
+
>>> import pandas as pd
|
964
|
+
>>> df = pd.DataFrame({'dt_tz_aware': [datetime(2025, 1, 1, tzinfo=timezone.utc)]})
|
965
|
+
>>> get_datetime_cols_types(df)
|
966
|
+
{'dt_tz_aware': 'datetime64[us, UTC]'}
|
967
|
+
>>> df = pd.DataFrame({'distant_dt': [datetime(1, 1, 1)]})
|
968
|
+
>>> get_datetime_cols_types(df)
|
969
|
+
{'distant_dt': 'datetime64[us]'}
|
970
|
+
>>> df = pd.DataFrame({'dt_second': datetime(2025, 1, 1)})
|
971
|
+
>>> df['dt_second'] = df['dt_second'].astype('datetime64[s]')
|
972
|
+
>>> get_datetime_cols_types(df)
|
973
|
+
{'dt_second': 'datetime64[s]'}
|
974
|
+
"""
|
975
|
+
from meerschaum.utils.dtypes import MRSM_PRECISION_UNITS_ABBREVIATIONS
|
976
|
+
dt_cols_tuples = get_datetime_cols(df, with_tz_precision=True)
|
977
|
+
if not dt_cols_tuples:
|
978
|
+
return {}
|
979
|
+
|
980
|
+
return {
|
981
|
+
col: (
|
982
|
+
f"datetime64[{MRSM_PRECISION_UNITS_ABBREVIATIONS[precision]}]"
|
983
|
+
if tz is None
|
984
|
+
else f"datetime64[{MRSM_PRECISION_UNITS_ABBREVIATIONS[precision]}, {tz}]"
|
985
|
+
)
|
986
|
+
for col, (tz, precision) in dt_cols_tuples.items()
|
987
|
+
}
|
988
|
+
|
989
|
+
|
990
|
+
def get_date_cols(df: 'pd.DataFrame') -> List[str]:
|
991
|
+
"""
|
992
|
+
Get the `date` columns from a Pandas DataFrame.
|
993
|
+
|
994
|
+
Parameters
|
995
|
+
----------
|
996
|
+
df: pd.DataFrame
|
997
|
+
The DataFrame which may contain dates.
|
998
|
+
|
999
|
+
Returns
|
1000
|
+
-------
|
1001
|
+
A list of columns to treat as dates.
|
1002
|
+
"""
|
1003
|
+
from meerschaum.utils.dtypes import are_dtypes_equal
|
1004
|
+
if df is None:
|
1005
|
+
return []
|
1006
|
+
|
1007
|
+
is_dask = 'dask' in df.__module__
|
1008
|
+
if is_dask:
|
1009
|
+
df = get_first_valid_dask_partition(df)
|
1010
|
+
|
1011
|
+
known_date_cols = [
|
1012
|
+
col
|
1013
|
+
for col, typ in df.dtypes.items()
|
1014
|
+
if are_dtypes_equal(typ, 'date')
|
1015
|
+
]
|
1016
|
+
|
1017
|
+
if len(df) == 0:
|
1018
|
+
return known_date_cols
|
1019
|
+
|
1020
|
+
cols_indices = {
|
1021
|
+
col: df[col].first_valid_index()
|
1022
|
+
for col in df.columns
|
1023
|
+
if col not in known_date_cols
|
1024
|
+
}
|
1025
|
+
object_date_cols = [
|
1026
|
+
col
|
1027
|
+
for col, ix in cols_indices.items()
|
1028
|
+
if (
|
1029
|
+
ix is not None
|
1030
|
+
and isinstance(df.loc[ix][col], date)
|
1031
|
+
)
|
1032
|
+
]
|
1033
|
+
|
1034
|
+
all_date_cols = set(known_date_cols + object_date_cols)
|
829
1035
|
|
830
1036
|
return [
|
831
1037
|
col
|
832
|
-
for col in
|
833
|
-
if col
|
1038
|
+
for col in df.columns
|
1039
|
+
if col in all_date_cols
|
834
1040
|
]
|
835
1041
|
|
836
1042
|
|
@@ -849,27 +1055,42 @@ def get_bytes_cols(df: 'pd.DataFrame') -> List[str]:
|
|
849
1055
|
"""
|
850
1056
|
if df is None:
|
851
1057
|
return []
|
1058
|
+
|
852
1059
|
is_dask = 'dask' in df.__module__
|
853
1060
|
if is_dask:
|
854
1061
|
df = get_first_valid_dask_partition(df)
|
855
1062
|
|
1063
|
+
known_bytes_cols = [
|
1064
|
+
col
|
1065
|
+
for col, typ in df.dtypes.items()
|
1066
|
+
if str(typ) == 'binary[pyarrow]'
|
1067
|
+
]
|
1068
|
+
|
856
1069
|
if len(df) == 0:
|
857
|
-
return
|
1070
|
+
return known_bytes_cols
|
858
1071
|
|
859
1072
|
cols_indices = {
|
860
1073
|
col: df[col].first_valid_index()
|
861
1074
|
for col in df.columns
|
1075
|
+
if col not in known_bytes_cols
|
862
1076
|
}
|
863
|
-
|
1077
|
+
object_bytes_cols = [
|
864
1078
|
col
|
865
1079
|
for col, ix in cols_indices.items()
|
866
1080
|
if (
|
867
1081
|
ix is not None
|
868
|
-
and
|
869
|
-
isinstance(df.loc[ix][col], bytes)
|
1082
|
+
and isinstance(df.loc[ix][col], bytes)
|
870
1083
|
)
|
871
1084
|
]
|
872
1085
|
|
1086
|
+
all_bytes_cols = set(known_bytes_cols + object_bytes_cols)
|
1087
|
+
|
1088
|
+
return [
|
1089
|
+
col
|
1090
|
+
for col in df.columns
|
1091
|
+
if col in all_bytes_cols
|
1092
|
+
]
|
1093
|
+
|
873
1094
|
|
874
1095
|
def get_geometry_cols(
|
875
1096
|
df: 'pd.DataFrame',
|
@@ -892,14 +1113,14 @@ def get_geometry_cols(
|
|
892
1113
|
If `with_types_srids`, return a dictionary mapping columns to tuples in the form (type, SRID).
|
893
1114
|
"""
|
894
1115
|
if df is None:
|
895
|
-
return []
|
1116
|
+
return [] if not with_types_srids else {}
|
896
1117
|
|
897
1118
|
is_dask = 'dask' in df.__module__
|
898
1119
|
if is_dask:
|
899
1120
|
df = get_first_valid_dask_partition(df)
|
900
1121
|
|
901
1122
|
if len(df) == 0:
|
902
|
-
return []
|
1123
|
+
return [] if not with_types_srids else {}
|
903
1124
|
|
904
1125
|
cols_indices = {
|
905
1126
|
col: df[col].first_valid_index()
|
@@ -948,11 +1169,54 @@ def get_geometry_cols(
|
|
948
1169
|
return geo_cols_types_srids
|
949
1170
|
|
950
1171
|
|
1172
|
+
def get_geometry_cols_types(df: 'pd.DataFrame') -> Dict[str, str]:
|
1173
|
+
"""
|
1174
|
+
Return a dtypes dictionary mapping columns to specific geometry types (type, srid).
|
1175
|
+
"""
|
1176
|
+
geometry_cols_types_srids = get_geometry_cols(df, with_types_srids=True)
|
1177
|
+
new_cols_types = {}
|
1178
|
+
for col, (geometry_type, srid) in geometry_cols_types_srids.items():
|
1179
|
+
new_dtype = "geometry"
|
1180
|
+
modifier = ""
|
1181
|
+
if not srid and geometry_type.lower() == 'geometry':
|
1182
|
+
new_cols_types[col] = new_dtype
|
1183
|
+
continue
|
1184
|
+
|
1185
|
+
modifier = "["
|
1186
|
+
if geometry_type.lower() != 'geometry':
|
1187
|
+
modifier += f"{geometry_type}"
|
1188
|
+
|
1189
|
+
if srid:
|
1190
|
+
if modifier != '[':
|
1191
|
+
modifier += ", "
|
1192
|
+
modifier += f"{srid}"
|
1193
|
+
modifier += "]"
|
1194
|
+
new_cols_types[col] = f"{new_dtype}{modifier}"
|
1195
|
+
return new_cols_types
|
1196
|
+
|
1197
|
+
|
1198
|
+
def get_special_cols(df: 'pd.DataFrame') -> Dict[str, str]:
|
1199
|
+
"""
|
1200
|
+
Return a dtypes dictionary mapping special columns to their dtypes.
|
1201
|
+
"""
|
1202
|
+
return {
|
1203
|
+
**{col: 'json' for col in get_json_cols(df)},
|
1204
|
+
**{col: 'uuid' for col in get_uuid_cols(df)},
|
1205
|
+
**{col: 'bytes' for col in get_bytes_cols(df)},
|
1206
|
+
**{col: 'bool' for col in get_bool_cols(df)},
|
1207
|
+
**{col: 'numeric' for col in get_numeric_cols(df)},
|
1208
|
+
**{col: 'date' for col in get_date_cols(df)},
|
1209
|
+
**get_datetime_cols_types(df),
|
1210
|
+
**get_geometry_cols_types(df),
|
1211
|
+
}
|
1212
|
+
|
1213
|
+
|
951
1214
|
def enforce_dtypes(
|
952
1215
|
df: 'pd.DataFrame',
|
953
1216
|
dtypes: Dict[str, str],
|
1217
|
+
explicit_dtypes: Optional[Dict[str, str]] = None,
|
954
1218
|
safe_copy: bool = True,
|
955
|
-
coerce_numeric: bool =
|
1219
|
+
coerce_numeric: bool = False,
|
956
1220
|
coerce_timezone: bool = True,
|
957
1221
|
strip_timezone: bool = False,
|
958
1222
|
debug: bool = False,
|
@@ -968,12 +1232,16 @@ def enforce_dtypes(
|
|
968
1232
|
dtypes: Dict[str, str]
|
969
1233
|
The data types to attempt to enforce on the DataFrame.
|
970
1234
|
|
1235
|
+
explicit_dtypes: Optional[Dict[str, str]], default None
|
1236
|
+
If provided, automatic dtype coersion will respect explicitly configured
|
1237
|
+
dtypes (`int`, `float`, `numeric`).
|
1238
|
+
|
971
1239
|
safe_copy: bool, default True
|
972
1240
|
If `True`, create a copy before comparing and modifying the dataframes.
|
973
1241
|
Setting to `False` may mutate the DataFrames.
|
974
1242
|
See `meerschaum.utils.dataframe.filter_unseen_df`.
|
975
1243
|
|
976
|
-
coerce_numeric: bool, default
|
1244
|
+
coerce_numeric: bool, default False
|
977
1245
|
If `True`, convert float and int collisions to numeric.
|
978
1246
|
|
979
1247
|
coerce_timezone: bool, default True
|
@@ -1015,6 +1283,7 @@ def enforce_dtypes(
|
|
1015
1283
|
dprint("Incoming DataFrame has no columns. Skipping enforcement...")
|
1016
1284
|
return df
|
1017
1285
|
|
1286
|
+
explicit_dtypes = explicit_dtypes or {}
|
1018
1287
|
pipe_pandas_dtypes = {
|
1019
1288
|
col: to_pandas_dtype(typ)
|
1020
1289
|
for col, typ in dtypes.items()
|
@@ -1121,11 +1390,24 @@ def enforce_dtypes(
|
|
1121
1390
|
dprint(f"Checking for datetime conversion: {datetime_cols}")
|
1122
1391
|
for col in datetime_cols:
|
1123
1392
|
if col in df.columns:
|
1393
|
+
if not strip_timezone and 'utc' in str(df.dtypes[col]).lower():
|
1394
|
+
if debug:
|
1395
|
+
dprint(f"Skip UTC coersion for column '{col}' ({str(df[col].dtype)}).")
|
1396
|
+
continue
|
1397
|
+
if strip_timezone and ',' not in str(df.dtypes[col]):
|
1398
|
+
if debug:
|
1399
|
+
dprint(
|
1400
|
+
f"Skip UTC coersion (stripped) for column '{col}' "
|
1401
|
+
f"({str(df[col].dtype)})."
|
1402
|
+
)
|
1403
|
+
continue
|
1404
|
+
|
1124
1405
|
if debug:
|
1125
1406
|
dprint(
|
1126
1407
|
f"Data type for column '{col}' before timezone coersion: "
|
1127
1408
|
f"{str(df[col].dtype)}"
|
1128
1409
|
)
|
1410
|
+
|
1129
1411
|
df[col] = _coerce_timezone(df[col], strip_utc=strip_timezone)
|
1130
1412
|
if debug:
|
1131
1413
|
dprint(
|
@@ -1206,13 +1488,45 @@ def enforce_dtypes(
|
|
1206
1488
|
for col, typ in {k: v for k, v in common_diff_dtypes.items()}.items():
|
1207
1489
|
previous_typ = common_dtypes[col]
|
1208
1490
|
mixed_numeric_types = (is_dtype_numeric(typ) and is_dtype_numeric(previous_typ))
|
1209
|
-
explicitly_float = are_dtypes_equal(
|
1210
|
-
|
1211
|
-
|
1212
|
-
|
1213
|
-
|
1214
|
-
|
1215
|
-
|
1491
|
+
explicitly_float = are_dtypes_equal(explicit_dtypes.get(col, 'object'), 'float')
|
1492
|
+
explicitly_int = are_dtypes_equal(explicit_dtypes.get(col, 'object'), 'int')
|
1493
|
+
explicitly_numeric = explicit_dtypes.get(col, 'object').startswith('numeric')
|
1494
|
+
all_nan = (
|
1495
|
+
df[col].isnull().all()
|
1496
|
+
if mixed_numeric_types and coerce_numeric and not (explicitly_float or explicitly_int)
|
1497
|
+
else None
|
1498
|
+
)
|
1499
|
+
cast_to_numeric = explicitly_numeric or (
|
1500
|
+
(
|
1501
|
+
col in df_numeric_cols
|
1502
|
+
or (
|
1503
|
+
mixed_numeric_types
|
1504
|
+
and not (explicitly_float or explicitly_int)
|
1505
|
+
and not all_nan
|
1506
|
+
and coerce_numeric
|
1507
|
+
)
|
1508
|
+
)
|
1509
|
+
)
|
1510
|
+
|
1511
|
+
if debug and (explicitly_numeric or df_numeric_cols or mixed_numeric_types):
|
1512
|
+
from meerschaum.utils.formatting import make_header
|
1513
|
+
msg = (
|
1514
|
+
make_header(f"Coercing column '{col}' to numeric:", left_pad=0)
|
1515
|
+
+ "\n"
|
1516
|
+
+ f" Previous type: {previous_typ}\n"
|
1517
|
+
+ f" Current type: {typ if col not in df_numeric_cols else 'Decimal'}"
|
1518
|
+
+ ("\n Column is explicitly numeric." if explicitly_numeric else "")
|
1519
|
+
) if cast_to_numeric else (
|
1520
|
+
f"Will not coerce column '{col}' to numeric.\n"
|
1521
|
+
f" Numeric columns in dataframe: {df_numeric_cols}\n"
|
1522
|
+
f" Mixed numeric types: {mixed_numeric_types}\n"
|
1523
|
+
f" Explicitly float: {explicitly_float}\n"
|
1524
|
+
f" Explicitly int: {explicitly_int}\n"
|
1525
|
+
f" All NaN: {all_nan}\n"
|
1526
|
+
f" Coerce numeric: {coerce_numeric}"
|
1527
|
+
)
|
1528
|
+
dprint(msg)
|
1529
|
+
|
1216
1530
|
if cast_to_numeric:
|
1217
1531
|
common_dtypes[col] = attempt_cast_to_numeric
|
1218
1532
|
common_diff_dtypes[col] = attempt_cast_to_numeric
|
@@ -1229,7 +1543,7 @@ def enforce_dtypes(
|
|
1229
1543
|
)
|
1230
1544
|
except Exception as e:
|
1231
1545
|
if debug:
|
1232
|
-
dprint(f"Encountered an error when casting column {d} to type {t}:\n{e}")
|
1546
|
+
dprint(f"Encountered an error when casting column {d} to type {t}:\n{e}\ndf:\n{df}")
|
1233
1547
|
if 'int' in str(t).lower():
|
1234
1548
|
try:
|
1235
1549
|
df[d] = df[d].astype('float64').astype(t)
|
@@ -1425,10 +1739,15 @@ def df_from_literal(
|
|
1425
1739
|
from meerschaum.utils.packages import import_pandas
|
1426
1740
|
from meerschaum.utils.warnings import error, warn
|
1427
1741
|
from meerschaum.utils.debug import dprint
|
1742
|
+
from meerschaum.utils.dtypes import get_current_timestamp
|
1428
1743
|
|
1429
1744
|
if pipe is None or literal is None:
|
1430
1745
|
error("Please provide a Pipe and a literal value")
|
1431
|
-
|
1746
|
+
|
1747
|
+
dt_col = pipe.columns.get(
|
1748
|
+
'datetime',
|
1749
|
+
mrsm.get_config('pipes', 'autotime', 'column_name_if_datetime_missing')
|
1750
|
+
)
|
1432
1751
|
val_col = pipe.get_val_column(debug=debug)
|
1433
1752
|
|
1434
1753
|
val = literal
|
@@ -1445,9 +1764,7 @@ def df_from_literal(
|
|
1445
1764
|
)
|
1446
1765
|
val = literal
|
1447
1766
|
|
1448
|
-
|
1449
|
-
now = datetime.now(timezone.utc).replace(tzinfo=None)
|
1450
|
-
|
1767
|
+
now = get_current_timestamp(pipe.precision)
|
1451
1768
|
pd = import_pandas()
|
1452
1769
|
return pd.DataFrame({dt_col: [now], val_col: [val]})
|
1453
1770
|
|