meerschaum 3.0.0rc1__py3-none-any.whl → 3.0.0rc3__py3-none-any.whl

This diff compares the contents of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
Files changed (66)
  1. meerschaum/_internal/arguments/_parser.py +2 -1
  2. meerschaum/_internal/docs/index.py +49 -2
  3. meerschaum/_internal/shell/Shell.py +5 -4
  4. meerschaum/_internal/static.py +8 -24
  5. meerschaum/actions/bootstrap.py +1 -1
  6. meerschaum/actions/edit.py +6 -3
  7. meerschaum/actions/start.py +1 -1
  8. meerschaum/actions/verify.py +5 -8
  9. meerschaum/api/__init__.py +2 -1
  10. meerschaum/api/dash/__init__.py +0 -2
  11. meerschaum/api/dash/callbacks/__init__.py +1 -0
  12. meerschaum/api/dash/callbacks/dashboard.py +20 -19
  13. meerschaum/api/dash/callbacks/jobs.py +11 -5
  14. meerschaum/api/dash/callbacks/pipes.py +106 -5
  15. meerschaum/api/dash/callbacks/settings/__init__.py +0 -1
  16. meerschaum/api/dash/callbacks/{settings/tokens.py → tokens.py} +1 -1
  17. meerschaum/api/dash/jobs.py +1 -1
  18. meerschaum/api/dash/pages/__init__.py +2 -1
  19. meerschaum/api/dash/pages/{job.py → jobs.py} +10 -7
  20. meerschaum/api/dash/pages/pipes.py +4 -3
  21. meerschaum/api/dash/pages/settings/__init__.py +0 -1
  22. meerschaum/api/dash/pages/{settings/tokens.py → tokens.py} +6 -8
  23. meerschaum/api/dash/pipes.py +131 -0
  24. meerschaum/api/dash/tokens.py +28 -31
  25. meerschaum/api/routes/_pipes.py +47 -37
  26. meerschaum/config/_default.py +13 -2
  27. meerschaum/config/_paths.py +1 -0
  28. meerschaum/config/_version.py +1 -1
  29. meerschaum/config/stack/__init__.py +9 -8
  30. meerschaum/connectors/api/_pipes.py +2 -18
  31. meerschaum/connectors/api/_tokens.py +2 -2
  32. meerschaum/connectors/instance/_tokens.py +10 -6
  33. meerschaum/connectors/sql/_SQLConnector.py +14 -0
  34. meerschaum/connectors/sql/_create_engine.py +3 -14
  35. meerschaum/connectors/sql/_pipes.py +175 -185
  36. meerschaum/connectors/sql/_sql.py +38 -20
  37. meerschaum/connectors/sql/tables/__init__.py +237 -122
  38. meerschaum/connectors/valkey/_pipes.py +44 -16
  39. meerschaum/core/Pipe/__init__.py +28 -5
  40. meerschaum/core/Pipe/_attributes.py +273 -46
  41. meerschaum/core/Pipe/_data.py +55 -17
  42. meerschaum/core/Pipe/_dtypes.py +19 -4
  43. meerschaum/core/Pipe/_edit.py +2 -0
  44. meerschaum/core/Pipe/_fetch.py +1 -1
  45. meerschaum/core/Pipe/_sync.py +90 -160
  46. meerschaum/core/Pipe/_verify.py +3 -3
  47. meerschaum/core/Token/_Token.py +4 -5
  48. meerschaum/plugins/bootstrap.py +508 -3
  49. meerschaum/utils/_get_pipes.py +1 -1
  50. meerschaum/utils/dataframe.py +385 -68
  51. meerschaum/utils/debug.py +15 -15
  52. meerschaum/utils/dtypes/__init__.py +387 -22
  53. meerschaum/utils/dtypes/sql.py +327 -31
  54. meerschaum/utils/misc.py +9 -68
  55. meerschaum/utils/packages/__init__.py +7 -21
  56. meerschaum/utils/packages/_packages.py +7 -2
  57. meerschaum/utils/schedule.py +1 -1
  58. meerschaum/utils/sql.py +8 -8
  59. {meerschaum-3.0.0rc1.dist-info → meerschaum-3.0.0rc3.dist-info}/METADATA +5 -17
  60. {meerschaum-3.0.0rc1.dist-info → meerschaum-3.0.0rc3.dist-info}/RECORD +66 -65
  61. meerschaum-3.0.0rc3.dist-info/licenses/NOTICE +2 -0
  62. {meerschaum-3.0.0rc1.dist-info → meerschaum-3.0.0rc3.dist-info}/WHEEL +0 -0
  63. {meerschaum-3.0.0rc1.dist-info → meerschaum-3.0.0rc3.dist-info}/entry_points.txt +0 -0
  64. {meerschaum-3.0.0rc1.dist-info → meerschaum-3.0.0rc3.dist-info}/licenses/LICENSE +0 -0
  65. {meerschaum-3.0.0rc1.dist-info → meerschaum-3.0.0rc3.dist-info}/top_level.txt +0 -0
  66. {meerschaum-3.0.0rc1.dist-info → meerschaum-3.0.0rc3.dist-info}/zip-safe +0 -0
meerschaum/utils/dataframe.py
@@ -8,14 +8,13 @@ Utility functions for working with DataFrames.
 
 from __future__ import annotations
 
-import pathlib
-from datetime import datetime, timezone
+from datetime import datetime, timezone, date
 from collections import defaultdict
 
 import meerschaum as mrsm
 from meerschaum.utils.typing import (
     Optional, Dict, Any, List, Hashable, Generator,
-    Iterator, Iterable, Union, TYPE_CHECKING,
+    Iterator, Iterable, Union, TYPE_CHECKING, Tuple,
 )
 
 if TYPE_CHECKING:
@@ -249,8 +248,10 @@ def filter_unseen_df(
     cast_dt_cols = True
     try:
         for col, typ in dt_dtypes.items():
+            _dtypes_col_dtype = str((dtypes or {}).get(col, 'datetime'))
             strip_utc = (
-                (dtypes or {}).get(col, 'datetime') == 'datetime64[ns]'
+                _dtypes_col_dtype.startswith('datetime64')
+                and 'utc' not in _dtypes_col_dtype.lower()
             )
             if col in old_df.columns:
                 old_df[col] = coerce_timezone(old_df[col], strip_utc=strip_utc)
@@ -327,8 +328,10 @@ def filter_unseen_df(
         if are_dtypes_equal(str(typ), 'datetime')
     ]
     for col in old_dt_cols:
+        _dtypes_col_dtype = str((dtypes or {}).get(col, 'datetime'))
         strip_utc = (
-            (dtypes or {}).get(col, 'datetime') == 'datetime64[ns]'
+            _dtypes_col_dtype.startswith('datetime64')
+            and 'utc' not in _dtypes_col_dtype.lower()
        )
         old_df[col] = coerce_timezone(old_df[col], strip_utc=strip_utc)
 
@@ -338,8 +341,10 @@ def filter_unseen_df(
         if are_dtypes_equal(str(typ), 'datetime')
     ]
     for col in new_dt_cols:
+        _dtypes_col_dtype = str((dtypes or {}).get(col, 'datetime'))
         strip_utc = (
-            (dtypes or {}).get(col, 'datetime') == 'datetime64[ns]'
+            _dtypes_col_dtype.startswith('datetime64')
+            and 'utc' not in _dtypes_col_dtype.lower()
        )
         new_df[col] = coerce_timezone(new_df[col], strip_utc=strip_utc)
 
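The three `filter_unseen_df` hunks above replace the exact `'datetime64[ns]'` comparison with a prefix-and-timezone check, so any timezone-naive `datetime64[*]` dtype now triggers UTC stripping. A minimal standalone sketch of that decision (illustrative names, not meerschaum API):

```python
# Sketch of the strip-UTC decision used above: strip when the configured dtype
# is a datetime64 variant that carries no timezone (no 'utc' in the string).
def should_strip_utc(configured_dtype: str) -> bool:
    dtype_str = str(configured_dtype)
    return dtype_str.startswith('datetime64') and 'utc' not in dtype_str.lower()

print(should_strip_utc('datetime64[ns]'))       # True  (naive, nanosecond)
print(should_strip_utc('datetime64[us]'))       # True  (previously missed by the == check)
print(should_strip_utc('datetime64[us, UTC]'))  # False (already timezone-aware)
print(should_strip_utc('datetime'))             # False (generic alias, left as-is)
```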
@@ -423,6 +428,8 @@ def parse_df_datetimes(
     chunksize: Optional[int] = None,
     dtype_backend: str = 'numpy_nullable',
     ignore_all: bool = False,
+    precision_unit: Optional[str] = None,
+    coerce_utc: bool = True,
     debug: bool = False,
 ) -> 'pd.DataFrame':
     """
@@ -450,6 +457,12 @@ def parse_df_datetimes(
     ignore_all: bool, default False
         If `True`, do not attempt to cast any columns to datetimes.
 
+    precision_unit: Optional[str], default None
+        If provided, enforce the given precision on the coerced datetime columns.
+
+    coerce_utc: bool, default True
+        Coerce the datetime columns to UTC (see `meerschaum.utils.dtypes.to_datetime()`).
+
     debug: bool, default False
         Verbosity toggle.
 
@@ -466,9 +479,9 @@ def parse_df_datetimes(
     >>> df.dtypes
     a    object
     dtype: object
-    >>> df = parse_df_datetimes(df)
-    >>> df.dtypes
-    a    datetime64[ns]
+    >>> df2 = parse_df_datetimes(df)
+    >>> df2.dtypes
+    a    datetime64[us, UTC]
     dtype: object
 
     ```
@@ -478,8 +491,9 @@ def parse_df_datetimes(
     from meerschaum.utils.debug import dprint
     from meerschaum.utils.warnings import warn
     from meerschaum.utils.misc import items_str
-    from meerschaum.utils.dtypes import to_datetime
+    from meerschaum.utils.dtypes import to_datetime, MRSM_PD_DTYPES
     import traceback
+
     pd = import_pandas()
     pandas = attempt_import('pandas')
     pd_name = pd.__name__
@@ -567,22 +581,25 @@ def parse_df_datetimes(
     if debug:
         dprint("Converting columns to datetimes: " + str(datetime_cols))
 
+    def _parse_to_datetime(x):
+        return to_datetime(x, precision_unit=precision_unit, coerce_utc=coerce_utc)
+
     try:
         if not using_dask:
-            df[datetime_cols] = df[datetime_cols].apply(to_datetime)
+            df[datetime_cols] = df[datetime_cols].apply(_parse_to_datetime)
         else:
             df[datetime_cols] = df[datetime_cols].apply(
-                to_datetime,
+                _parse_to_datetime,
                 utc=True,
                 axis=1,
                 meta={
-                    col: 'datetime64[ns, UTC]'
+                    col: MRSM_PD_DTYPES['datetime']
                     for col in datetime_cols
                 }
             )
     except Exception:
         warn(
-            f"Unable to apply `pd.to_datetime` to {items_str(datetime_cols)}:\n"
+            f"Unable to apply `to_datetime()` to {items_str(datetime_cols)}:\n"
             + f"{traceback.format_exc()}"
         )
 
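The wrapper above threads the new `precision_unit` and `coerce_utc` arguments through to `meerschaum.utils.dtypes.to_datetime()`. A rough pandas-only approximation of the target dtype shown in the updated docstring (`datetime64[us, UTC]`), not the actual implementation:

```python
import pandas as pd

df = pd.DataFrame({'a': ['2022-01-01 00:00:00', '2022-01-02 12:34:56']})

# Roughly what the wrapped to_datetime() call aims for: UTC-aware values
# at an explicit precision (microseconds here).
parsed = pd.to_datetime(df['a'], utc=True)
df['a'] = parsed.astype('datetime64[us, UTC]')

print(df.dtypes)  # a    datetime64[us, UTC]
```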
@@ -660,8 +677,7 @@ def get_json_cols(df: 'pd.DataFrame') -> List[str]:
         for col, ix in cols_indices.items()
         if (
             ix is not None
-            and
-            not isinstance(df.loc[ix][col], Hashable)
+            and isinstance(df.loc[ix][col], (dict, list))
         )
     ]
 
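The `get_json_cols` predicate above narrows from "any unhashable sample value" to an explicit `dict`/`list` check. A small self-contained comparison of the two predicates (not meerschaum code); note how a `set` sample no longer counts as JSON:

```python
from collections.abc import Hashable

samples = [{'k': 1}, [1, 2], {1, 2}, 'text']

# Old check: anything unhashable counted as JSON (dicts, lists, and sets).
print([not isinstance(x, Hashable) for x in samples])   # [True, True, True, False]

# New check: only dicts and lists map to the `json` dtype.
print([isinstance(x, (dict, list)) for x in samples])   # [True, True, False, False]
```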
@@ -704,6 +720,38 @@ def get_numeric_cols(df: 'pd.DataFrame') -> List[str]:
     ]
 
 
+def get_bool_cols(df: 'pd.DataFrame') -> List[str]:
+    """
+    Get the columns which contain `bool` objects from a Pandas DataFrame.
+
+    Parameters
+    ----------
+    df: pd.DataFrame
+        The DataFrame which may contain bools.
+
+    Returns
+    -------
+    A list of columns to treat as bools.
+    """
+    if df is None:
+        return []
+
+    is_dask = 'dask' in df.__module__
+    if is_dask:
+        df = get_first_valid_dask_partition(df)
+
+    if len(df) == 0:
+        return []
+
+    from meerschaum.utils.dtypes import are_dtypes_equal
+
+    return [
+        col
+        for col, typ in df.dtypes.items()
+        if are_dtypes_equal(str(typ), 'bool')
+    ]
+
+
 def get_uuid_cols(df: 'pd.DataFrame') -> List[str]:
     """
     Get the columns which contain `uuid.UUID` objects from a Pandas DataFrame.
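A usage sketch for the new `get_bool_cols` helper, assuming the 3.0.0rc3 wheel is installed; the expected output is hypothetical but follows the dtype check above:

```python
import pandas as pd
from meerschaum.utils.dataframe import get_bool_cols  # added in 3.0.0rc3

df = pd.DataFrame({
    'flag': [True, False, True],   # plain 'bool' dtype
    'count': [1, 2, 3],            # int64
    'name': ['a', 'b', 'c'],       # object
})
print(get_bool_cols(df))  # expected: ['flag']
```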
@@ -746,7 +794,8 @@ def get_datetime_cols(
     df: 'pd.DataFrame',
     timezone_aware: bool = True,
     timezone_naive: bool = True,
-) -> List[str]:
+    with_tz_precision: bool = False,
+) -> Union[List[str], Dict[str, Tuple[Union[str, None], str]]]:
     """
     Get the columns which contain `datetime` or `Timestamp` objects from a Pandas DataFrame.
 
@@ -761,76 +810,233 @@ def get_datetime_cols(
     timezone_naive: bool, default True
         If `True`, include timezone-naive datetime columns.
 
+    with_tz_precision: bool, default False
+        If `True`, return a dictionary mapping column names to tuples in the form
+        `(timezone, precision)`.
+
     Returns
     -------
-    A list of columns to treat as datetimes.
+    A list of columns to treat as datetimes, or a dictionary of columns to tz+precision tuples
+    (if `with_tz_precision` is `True`).
     """
     if not timezone_aware and not timezone_naive:
         raise ValueError("`timezone_aware` and `timezone_naive` cannot both be `False`.")
 
     if df is None:
-        return []
+        return [] if not with_tz_precision else {}
 
     from datetime import datetime
-    from meerschaum.utils.dtypes import are_dtypes_equal
+    from meerschaum.utils.dtypes import are_dtypes_equal, MRSM_PRECISION_UNITS_ALIASES
     is_dask = 'dask' in df.__module__
     if is_dask:
         df = get_first_valid_dask_partition(df)
+
+    def get_tz_precision_from_dtype(dtype: str) -> Tuple[Union[str, None], str]:
+        """
+        Extract the tz + precision tuple from a dtype string.
+        """
+        meta_str = dtype.split('[', maxsplit=1)[-1].rstrip(']').replace(' ', '')
+        tz = (
+            None
+            if ',' not in meta_str
+            else meta_str.split(',', maxsplit=1)[-1]
+        )
+        precision_abbreviation = (
+            meta_str
+            if ',' not in meta_str
+            else meta_str.split(',')[0]
+        )
+        precision = MRSM_PRECISION_UNITS_ALIASES[precision_abbreviation]
+        return tz, precision
 
-    known_dt_cols = [
-        col
+    def get_tz_precision_from_datetime(dt: datetime) -> Tuple[Union[str, None], str]:
+        """
+        Return the tz + precision tuple from a Python datetime object.
+        """
+        return dt.tzname(), 'microsecond'
+
+    known_dt_cols_types = {
+        col: str(typ)
         for col, typ in df.dtypes.items()
         if are_dtypes_equal('datetime', str(typ))
-    ]
+    }
+
+    known_dt_cols_tuples = {
+        col: get_tz_precision_from_dtype(typ)
+        for col, typ in known_dt_cols_types.items()
+    }
 
     if len(df) == 0:
-        return known_dt_cols
+        return (
+            list(known_dt_cols_types)
+            if not with_tz_precision
+            else known_dt_cols_tuples
+        )
 
     cols_indices = {
         col: df[col].first_valid_index()
         for col in df.columns
-        if col not in known_dt_cols
+        if col not in known_dt_cols_types
     }
-    pydt_cols = [
-        col
+    pydt_cols_tuples = {
+        col: get_tz_precision_from_datetime(sample_val)
         for col, ix in cols_indices.items()
         if (
             ix is not None
             and
-            isinstance(df.loc[ix][col], datetime)
+            isinstance((sample_val := df.loc[ix][col]), datetime)
         )
-    ]
-    dt_cols_set = set(known_dt_cols + pydt_cols)
-    all_dt_cols = [
-        col
+    }
+
+    dt_cols_tuples = {
+        **known_dt_cols_tuples,
+        **pydt_cols_tuples
+    }
+
+    all_dt_cols_tuples = {
+        col: dt_cols_tuples[col]
         for col in df.columns
-        if col in dt_cols_set
-    ]
+        if col in dt_cols_tuples
+    }
     if timezone_aware and timezone_naive:
-        return all_dt_cols
+        return (
+            list(all_dt_cols_tuples)
+            if not with_tz_precision
+            else all_dt_cols_tuples
+        )
 
     known_timezone_aware_dt_cols = [
         col
-        for col in known_dt_cols
+        for col in known_dt_cols_types
         if getattr(df[col], 'tz', None) is not None
     ]
-    timezone_aware_pydt_cols = [
-        col
-        for col in pydt_cols
+    timezone_aware_pydt_cols_tuples = {
+        col: (tz, precision)
+        for col, (tz, precision) in pydt_cols_tuples.items()
         if df.loc[cols_indices[col]][col].tzinfo is not None
-    ]
-    timezone_aware_dt_cols_set = set(known_timezone_aware_dt_cols + timezone_aware_pydt_cols)
+    }
+    timezone_aware_dt_cols_set = set(
+        known_timezone_aware_dt_cols + list(timezone_aware_pydt_cols_tuples)
+    )
+    timezone_aware_cols_tuples = {
+        col: (tz, precision)
+        for col, (tz, precision) in all_dt_cols_tuples.items()
+        if col in timezone_aware_dt_cols_set
+    }
+    timezone_naive_cols_tuples = {
+        col: (tz, precision)
+        for col, (tz, precision) in all_dt_cols_tuples.items()
+        if col not in timezone_aware_dt_cols_set
+    }
+
     if timezone_aware:
-        return [
-            col
-            for col in all_dt_cols
-            if col in timezone_aware_pydt_cols
-        ]
+        return (
+            list(timezone_aware_cols_tuples)
+            if not with_tz_precision
+            else timezone_aware_cols_tuples
+        )
+
+    return (
+        list(timezone_naive_cols_tuples)
+        if not with_tz_precision
+        else timezone_naive_cols_tuples
+    )
+
+
+def get_datetime_cols_types(df: 'pd.DataFrame') -> Dict[str, str]:
+    """
+    Return a dictionary mapping datetime columns to specific types strings.
+
+    Parameters
+    ----------
+    df: pd.DataFrame
+        The DataFrame which may contain datetime columns.
+
+    Returns
+    -------
+    A dictionary mapping the datetime columns' names to dtype strings
+    (containing timezone and precision metadata).
+
+    Examples
+    --------
+    >>> from datetime import datetime, timezone
+    >>> import pandas as pd
+    >>> df = pd.DataFrame({'dt_tz_aware': [datetime(2025, 1, 1, tzinfo=timezone.utc)]})
+    >>> get_datetime_cols_types(df)
+    {'dt_tz_aware': 'datetime64[us, UTC]'}
+    >>> df = pd.DataFrame({'distant_dt': [datetime(1, 1, 1)]})
+    >>> get_datetime_cols_types(df)
+    {'distant_dt': 'datetime64[us]'}
+    >>> df = pd.DataFrame({'dt_second': datetime(2025, 1, 1)})
+    >>> df['dt_second'] = df['dt_second'].astype('datetime64[s]')
+    >>> get_datetime_cols_types(df)
+    {'dt_second': 'datetime64[s]'}
+    """
+    from meerschaum.utils.dtypes import MRSM_PRECISION_UNITS_ABBREVIATIONS
+    dt_cols_tuples = get_datetime_cols(df, with_tz_precision=True)
+    if not dt_cols_tuples:
+        return {}
+
+    return {
+        col: (
+            f"datetime64[{MRSM_PRECISION_UNITS_ABBREVIATIONS[precision]}]"
+            if tz is None
+            else f"datetime64[{MRSM_PRECISION_UNITS_ABBREVIATIONS[precision]}, {tz}]"
+        )
+        for col, (tz, precision) in dt_cols_tuples.items()
+    }
+
+
+def get_date_cols(df: 'pd.DataFrame') -> List[str]:
+    """
+    Get the `date` columns from a Pandas DataFrame.
+
+    Parameters
+    ----------
+    df: pd.DataFrame
+        The DataFrame which may contain dates.
+
+    Returns
+    -------
+    A list of columns to treat as dates.
+    """
+    from meerschaum.utils.dtypes import are_dtypes_equal
+    if df is None:
+        return []
+
+    is_dask = 'dask' in df.__module__
+    if is_dask:
+        df = get_first_valid_dask_partition(df)
+
+    known_date_cols = [
+        col
+        for col, typ in df.dtypes.items()
+        if are_dtypes_equal(typ, 'date')
+    ]
+
+    if len(df) == 0:
+        return known_date_cols
+
+    cols_indices = {
+        col: df[col].first_valid_index()
+        for col in df.columns
+        if col not in known_date_cols
+    }
+    object_date_cols = [
+        col
+        for col, ix in cols_indices.items()
+        if (
+            ix is not None
+            and isinstance(df.loc[ix][col], date)
+        )
+    ]
+
+    all_date_cols = set(known_date_cols + object_date_cols)
 
     return [
         col
-        for col in all_dt_cols
-        if col not in timezone_aware_dt_cols_set
+        for col in df.columns
+        if col in all_date_cols
     ]
 
 
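A usage sketch for the reworked datetime introspection, assuming the rc3 wheel is installed; the commented outputs are illustrative and mirror the doctest added above:

```python
from datetime import datetime, timezone
import pandas as pd
from meerschaum.utils.dataframe import get_datetime_cols, get_datetime_cols_types

df = pd.DataFrame({
    'dt_aware': [datetime(2025, 1, 1, tzinfo=timezone.utc)],
    'dt_naive': [datetime(2025, 1, 1)],
})

# Mapping of column -> (timezone, precision) instead of a flat list.
print(get_datetime_cols(df, with_tz_precision=True))
# e.g. {'dt_aware': ('UTC', 'microsecond'), 'dt_naive': (None, 'microsecond')}

# Mapping of column -> concrete pandas dtype string.
print(get_datetime_cols_types(df))
# e.g. {'dt_aware': 'datetime64[us, UTC]', 'dt_naive': 'datetime64[us]'}
```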
@@ -849,27 +1055,42 @@ def get_bytes_cols(df: 'pd.DataFrame') -> List[str]:
     """
     if df is None:
         return []
+
     is_dask = 'dask' in df.__module__
     if is_dask:
         df = get_first_valid_dask_partition(df)
 
+    known_bytes_cols = [
+        col
+        for col, typ in df.dtypes.items()
+        if str(typ) == 'binary[pyarrow]'
+    ]
+
     if len(df) == 0:
-        return []
+        return known_bytes_cols
 
     cols_indices = {
         col: df[col].first_valid_index()
         for col in df.columns
+        if col not in known_bytes_cols
     }
-    return [
+    object_bytes_cols = [
         col
         for col, ix in cols_indices.items()
         if (
             ix is not None
-            and
-            isinstance(df.loc[ix][col], bytes)
+            and isinstance(df.loc[ix][col], bytes)
         )
     ]
 
+    all_bytes_cols = set(known_bytes_cols + object_bytes_cols)
+
+    return [
+        col
+        for col in df.columns
+        if col in all_bytes_cols
+    ]
+
 
 def get_geometry_cols(
     df: 'pd.DataFrame',
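With the change above, a pyarrow-backed binary column is recognized by its declared dtype alone, even when it holds no rows to sample. A sketch assuming pandas with pyarrow and the rc3 wheel installed:

```python
import pandas as pd
from meerschaum.utils.dataframe import get_bytes_cols

df = pd.DataFrame({
    'payload': pd.Series([], dtype='binary[pyarrow]'),  # declared bytes, but empty
    'note': pd.Series([], dtype='object'),
})
print(get_bytes_cols(df))  # expected: ['payload']
```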
@@ -892,14 +1113,14 @@ def get_geometry_cols(
     If `with_types_srids`, return a dictionary mapping columns to tuples in the form (type, SRID).
     """
     if df is None:
-        return []
+        return [] if not with_types_srids else {}
 
     is_dask = 'dask' in df.__module__
     if is_dask:
         df = get_first_valid_dask_partition(df)
 
     if len(df) == 0:
-        return []
+        return [] if not with_types_srids else {}
 
     cols_indices = {
         col: df[col].first_valid_index()
@@ -948,11 +1169,54 @@ def get_geometry_cols(
     return geo_cols_types_srids
 
 
+def get_geometry_cols_types(df: 'pd.DataFrame') -> Dict[str, str]:
+    """
+    Return a dtypes dictionary mapping columns to specific geometry types (type, srid).
+    """
+    geometry_cols_types_srids = get_geometry_cols(df, with_types_srids=True)
+    new_cols_types = {}
+    for col, (geometry_type, srid) in geometry_cols_types_srids.items():
+        new_dtype = "geometry"
+        modifier = ""
+        if not srid and geometry_type.lower() == 'geometry':
+            new_cols_types[col] = new_dtype
+            continue
+
+        modifier = "["
+        if geometry_type.lower() != 'geometry':
+            modifier += f"{geometry_type}"
+
+        if srid:
+            if modifier != '[':
+                modifier += ", "
+            modifier += f"{srid}"
+        modifier += "]"
+        new_cols_types[col] = f"{new_dtype}{modifier}"
+    return new_cols_types
+
+
+def get_special_cols(df: 'pd.DataFrame') -> Dict[str, str]:
+    """
+    Return a dtypes dictionary mapping special columns to their dtypes.
+    """
+    return {
+        **{col: 'json' for col in get_json_cols(df)},
+        **{col: 'uuid' for col in get_uuid_cols(df)},
+        **{col: 'bytes' for col in get_bytes_cols(df)},
+        **{col: 'bool' for col in get_bool_cols(df)},
+        **{col: 'numeric' for col in get_numeric_cols(df)},
+        **{col: 'date' for col in get_date_cols(df)},
+        **get_datetime_cols_types(df),
+        **get_geometry_cols_types(df),
+    }
+
+
 def enforce_dtypes(
     df: 'pd.DataFrame',
     dtypes: Dict[str, str],
+    explicit_dtypes: Optional[Dict[str, str]] = None,
     safe_copy: bool = True,
-    coerce_numeric: bool = True,
+    coerce_numeric: bool = False,
     coerce_timezone: bool = True,
     strip_timezone: bool = False,
     debug: bool = False,
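The new `get_special_cols` rolls all of the detectors in this module into a single dtypes mapping. A hedged usage sketch (rc3 wheel assumed; the commented output is approximate):

```python
import uuid
import pandas as pd
from meerschaum.utils.dataframe import get_special_cols

df = pd.DataFrame({
    'meta': [{'source': 'sensor-1'}],  # dict sample -> 'json'
    'id': [uuid.uuid4()],              # UUID sample -> 'uuid'
    'flag': [True],                    # bool dtype  -> 'bool'
    'temp': [71.3],                    # plain float, not a "special" column
})
print(get_special_cols(df))
# expected (roughly): {'meta': 'json', 'id': 'uuid', 'flag': 'bool'}
```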
@@ -968,12 +1232,16 @@ def enforce_dtypes(
     dtypes: Dict[str, str]
         The data types to attempt to enforce on the DataFrame.
 
+    explicit_dtypes: Optional[Dict[str, str]], default None
+        If provided, automatic dtype coersion will respect explicitly configured
+        dtypes (`int`, `float`, `numeric`).
+
     safe_copy: bool, default True
         If `True`, create a copy before comparing and modifying the dataframes.
         Setting to `False` may mutate the DataFrames.
         See `meerschaum.utils.dataframe.filter_unseen_df`.
 
-    coerce_numeric: bool, default True
+    coerce_numeric: bool, default False
         If `True`, convert float and int collisions to numeric.
 
     coerce_timezone: bool, default True
@@ -1015,6 +1283,7 @@ def enforce_dtypes(
             dprint("Incoming DataFrame has no columns. Skipping enforcement...")
         return df
 
+    explicit_dtypes = explicit_dtypes or {}
     pipe_pandas_dtypes = {
         col: to_pandas_dtype(typ)
         for col, typ in dtypes.items()
@@ -1121,11 +1390,24 @@ def enforce_dtypes(
             dprint(f"Checking for datetime conversion: {datetime_cols}")
         for col in datetime_cols:
             if col in df.columns:
+                if not strip_timezone and 'utc' in str(df.dtypes[col]).lower():
+                    if debug:
+                        dprint(f"Skip UTC coersion for column '{col}' ({str(df[col].dtype)}).")
+                    continue
+                if strip_timezone and ',' not in str(df.dtypes[col]):
+                    if debug:
+                        dprint(
+                            f"Skip UTC coersion (stripped) for column '{col}' "
+                            f"({str(df[col].dtype)})."
+                        )
+                    continue
+
                 if debug:
                     dprint(
                         f"Data type for column '{col}' before timezone coersion: "
                         f"{str(df[col].dtype)}"
                     )
+
                 df[col] = _coerce_timezone(df[col], strip_utc=strip_timezone)
                 if debug:
                     dprint(
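The two skip guards above are plain dtype-string checks. An illustrative standalone predicate (not meerschaum code):

```python
# Illustrative only: the two guards above, expressed as a standalone predicate.
def should_skip_tz_coercion(dtype_str: str, strip_timezone: bool) -> bool:
    dtype_str = str(dtype_str)
    if not strip_timezone and 'utc' in dtype_str.lower():
        return True   # already UTC-aware; nothing to coerce
    if strip_timezone and ',' not in dtype_str:
        return True   # already naive (no ', <tz>' suffix); nothing to strip
    return False

print(should_skip_tz_coercion('datetime64[us, UTC]', strip_timezone=False))  # True
print(should_skip_tz_coercion('datetime64[us]', strip_timezone=True))        # True
print(should_skip_tz_coercion('datetime64[ns]', strip_timezone=False))       # False
```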
@@ -1206,13 +1488,45 @@ def enforce_dtypes(
     for col, typ in {k: v for k, v in common_diff_dtypes.items()}.items():
         previous_typ = common_dtypes[col]
         mixed_numeric_types = (is_dtype_numeric(typ) and is_dtype_numeric(previous_typ))
-        explicitly_float = are_dtypes_equal(dtypes.get(col, 'object'), 'float')
-        explicitly_numeric = dtypes.get(col, 'numeric').startswith('numeric')
-        cast_to_numeric = (
-            explicitly_numeric
-            or col in df_numeric_cols
-            or (mixed_numeric_types and not explicitly_float)
-        ) and coerce_numeric
+        explicitly_float = are_dtypes_equal(explicit_dtypes.get(col, 'object'), 'float')
+        explicitly_int = are_dtypes_equal(explicit_dtypes.get(col, 'object'), 'int')
+        explicitly_numeric = explicit_dtypes.get(col, 'object').startswith('numeric')
+        all_nan = (
+            df[col].isnull().all()
+            if mixed_numeric_types and coerce_numeric and not (explicitly_float or explicitly_int)
+            else None
+        )
+        cast_to_numeric = explicitly_numeric or (
+            (
+                col in df_numeric_cols
+                or (
+                    mixed_numeric_types
+                    and not (explicitly_float or explicitly_int)
+                    and not all_nan
+                    and coerce_numeric
+                )
+            )
+        )
+
+        if debug and (explicitly_numeric or df_numeric_cols or mixed_numeric_types):
+            from meerschaum.utils.formatting import make_header
+            msg = (
+                make_header(f"Coercing column '{col}' to numeric:", left_pad=0)
+                + "\n"
+                + f" Previous type: {previous_typ}\n"
+                + f" Current type: {typ if col not in df_numeric_cols else 'Decimal'}"
+                + ("\n Column is explicitly numeric." if explicitly_numeric else "")
+            ) if cast_to_numeric else (
+                f"Will not coerce column '{col}' to numeric.\n"
+                f" Numeric columns in dataframe: {df_numeric_cols}\n"
+                f" Mixed numeric types: {mixed_numeric_types}\n"
+                f" Explicitly float: {explicitly_float}\n"
+                f" Explicitly int: {explicitly_int}\n"
+                f" All NaN: {all_nan}\n"
+                f" Coerce numeric: {coerce_numeric}"
+            )
+            dprint(msg)
+
         if cast_to_numeric:
             common_dtypes[col] = attempt_cast_to_numeric
             common_diff_dtypes[col] = attempt_cast_to_numeric
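The reworked decision can be read as a standalone predicate: an explicit `numeric` dtype always wins; otherwise coercion requires existing `Decimal` values, or a mixed int/float collision with `coerce_numeric=True`, no explicit `int`/`float` dtype, and at least one non-null value. An illustrative sketch (not meerschaum code):

```python
# Illustrative sketch of the cast_to_numeric decision above.
def decide_cast_to_numeric(
    explicitly_numeric: bool,
    in_df_numeric_cols: bool,
    mixed_numeric_types: bool,
    explicitly_float: bool,
    explicitly_int: bool,
    all_nan: bool,
    coerce_numeric: bool,
) -> bool:
    return explicitly_numeric or (
        in_df_numeric_cols
        or (
            mixed_numeric_types
            and not (explicitly_float or explicitly_int)
            and not all_nan
            and coerce_numeric
        )
    )

# int/float collision, nothing explicit, coercion enabled -> cast to numeric.
print(decide_cast_to_numeric(False, False, True, False, False, False, True))  # True
# Same collision, but the column is explicitly float -> leave it alone.
print(decide_cast_to_numeric(False, False, True, True, False, False, True))   # False
```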
@@ -1229,7 +1543,7 @@ def enforce_dtypes(
             )
         except Exception as e:
             if debug:
-                dprint(f"Encountered an error when casting column {d} to type {t}:\n{e}")
+                dprint(f"Encountered an error when casting column {d} to type {t}:\n{e}\ndf:\n{df}")
             if 'int' in str(t).lower():
                 try:
                     df[d] = df[d].astype('float64').astype(t)
@@ -1425,10 +1739,15 @@ def df_from_literal(
     from meerschaum.utils.packages import import_pandas
     from meerschaum.utils.warnings import error, warn
     from meerschaum.utils.debug import dprint
+    from meerschaum.utils.dtypes import get_current_timestamp
 
     if pipe is None or literal is None:
         error("Please provide a Pipe and a literal value")
-    dt_col = pipe.columns.get('datetime', 'ts')
+
+    dt_col = pipe.columns.get(
+        'datetime',
+        mrsm.get_config('pipes', 'autotime', 'column_name_if_datetime_missing')
+    )
     val_col = pipe.get_val_column(debug=debug)
 
     val = literal
@@ -1445,9 +1764,7 @@ def df_from_literal(
             )
         val = literal
 
-    from datetime import datetime, timezone
-    now = datetime.now(timezone.utc).replace(tzinfo=None)
-
+    now = get_current_timestamp(pipe.precision)
     pd = import_pandas()
     return pd.DataFrame({dt_col: [now], val_col: [val]})
 
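With these last two hunks, `df_from_literal` stamps rows via `get_current_timestamp(pipe.precision)` and takes its fallback datetime column name from the `pipes:autotime` config rather than hard-coding `'ts'`. A rough pandas-only sketch of the resulting one-row frame (the `'ts'`/`'val'` names here are illustrative; the real fallback comes from `mrsm.get_config('pipes', 'autotime', 'column_name_if_datetime_missing')`):

```python
from datetime import datetime, timezone
import pandas as pd

# Rough sketch: one row stamped with the current UTC time and the literal value.
dt_col, val_col = 'ts', 'val'   # illustrative fallback names
now = datetime.now(timezone.utc)
df = pd.DataFrame({dt_col: [now], val_col: [123.45]})
print(df)
```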