meerschaum 2.5.1__py3-none-any.whl → 2.6.0.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. meerschaum/_internal/arguments/_parser.py +6 -1
  2. meerschaum/actions/edit.py +6 -6
  3. meerschaum/actions/sql.py +12 -11
  4. meerschaum/config/_edit.py +46 -19
  5. meerschaum/config/_read_config.py +20 -9
  6. meerschaum/config/_version.py +1 -1
  7. meerschaum/config/stack/__init__.py +1 -1
  8. meerschaum/connectors/sql/_pipes.py +80 -24
  9. meerschaum/connectors/sql/_sql.py +29 -10
  10. meerschaum/connectors/valkey/_pipes.py +1 -1
  11. meerschaum/core/Pipe/__init__.py +8 -9
  12. meerschaum/core/Pipe/_attributes.py +33 -11
  13. meerschaum/core/Pipe/_data.py +26 -7
  14. meerschaum/core/Pipe/_dtypes.py +4 -4
  15. meerschaum/core/Pipe/_fetch.py +1 -1
  16. meerschaum/core/Pipe/_sync.py +16 -4
  17. meerschaum/core/Pipe/_verify.py +1 -1
  18. meerschaum/utils/dataframe.py +56 -29
  19. meerschaum/utils/dtypes/__init__.py +16 -5
  20. meerschaum/utils/dtypes/sql.py +58 -28
  21. meerschaum/utils/misc.py +49 -16
  22. meerschaum/utils/sql.py +224 -40
  23. {meerschaum-2.5.1.dist-info → meerschaum-2.6.0.dev1.dist-info}/METADATA +1 -1
  24. {meerschaum-2.5.1.dist-info → meerschaum-2.6.0.dev1.dist-info}/RECORD +30 -30
  25. {meerschaum-2.5.1.dist-info → meerschaum-2.6.0.dev1.dist-info}/WHEEL +1 -1
  26. {meerschaum-2.5.1.dist-info → meerschaum-2.6.0.dev1.dist-info}/LICENSE +0 -0
  27. {meerschaum-2.5.1.dist-info → meerschaum-2.6.0.dev1.dist-info}/NOTICE +0 -0
  28. {meerschaum-2.5.1.dist-info → meerschaum-2.6.0.dev1.dist-info}/entry_points.txt +0 -0
  29. {meerschaum-2.5.1.dist-info → meerschaum-2.6.0.dev1.dist-info}/top_level.txt +0 -0
  30. {meerschaum-2.5.1.dist-info → meerschaum-2.6.0.dev1.dist-info}/zip-safe +0 -0
@@ -103,10 +103,25 @@ def indices(self) -> Union[Dict[str, Union[str, List[str]]], None]:
     if indices_key not in self.parameters:
         self.parameters[indices_key] = {}
     _indices = self.parameters[indices_key]
+    _columns = self.columns
+    dt_col = _columns.get('datetime', None)
     if not isinstance(_indices, dict):
         _indices = {}
         self.parameters[indices_key] = _indices
-    return {**self.columns, **_indices}
+    unique_cols = (
+        [dt_col]
+        if dt_col
+        else []
+    ) + [
+        col
+        for col_ix, col in _columns.items()
+        if col_ix != 'datetime'
+    ]
+    return {
+        **({'unique': unique_cols} if len(unique_cols) > 1 else {}),
+        **_columns,
+        **_indices
+    }


 @property
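The `indices` property now synthesizes a `unique` entry (datetime column first) whenever more than one index column is configured. A minimal sketch of the new behavior, with hypothetical pipe keys:

```python
import meerschaum as mrsm

# Hypothetical pipe with a datetime column and one other index column.
pipe = mrsm.Pipe(
    'demo', 'weather',
    columns={'datetime': 'dt', 'id': 'station'},
)

# Two index columns, so a 'unique' entry is synthesized with 'dt' first.
print(pipe.indices)
# {'unique': ['dt', 'station'], 'datetime': 'dt', 'id': 'station'}
```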
@@ -196,7 +211,7 @@ def get_columns(self, *args: str, error: bool = False) -> Union[str, Tuple[str]]
     ----------
     *args: str
         The column names to be retrieved.
-
+
     error: bool, default False
         If `True`, raise an `Exception` if the specified column is not defined.

@@ -509,15 +524,22 @@ def get_indices(self) -> Dict[str, str]:
         if cols
     }
     _index_names = {
-        ix: (
-            _index_template.format(
-                target=_target,
-                column_names=column_names,
-                connector_keys=self.connector_keys,
-                metric_key=self.connector_key,
-                location_key=self.location_key,
-            )
+        ix: _index_template.format(
+            target=_target,
+            column_names=column_names,
+            connector_keys=self.connector_keys,
+            metric_key=self.connector_key,
+            location_key=self.location_key,
         )
         for ix, column_names in _column_names.items()
     }
-    return _index_names
+    ### NOTE: Skip any duplicate indices.
+    seen_index_names = {}
+    for ix, index_name in _index_names.items():
+        if index_name in seen_index_names:
+            continue
+        seen_index_names[index_name] = ix
+    return {
+        ix: index_name
+        for index_name, ix in seen_index_names.items()
+    }
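The deduplication pass keeps the first key that produces a given index name, so two entries resolving to the same name (e.g. the synthesized `unique` index and a multi-column index) are only emitted once. A standalone sketch with made-up index names:

```python
# Made-up index names; 'id' and 'unique' resolve to the same name.
_index_names = {
    'datetime': 'IX_weather_dt',
    'id': 'IX_weather_dt_station',
    'unique': 'IX_weather_dt_station',
}
seen_index_names = {}
for ix, index_name in _index_names.items():
    if index_name in seen_index_names:
        continue  # skip duplicates, keeping the first key seen
    seen_index_names[index_name] = ix
print({ix: index_name for index_name, ix in seen_index_names.items()})
# {'datetime': 'IX_weather_dt', 'id': 'IX_weather_dt_station'}
```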
@@ -23,8 +23,8 @@ def get_data(
     self,
     select_columns: Optional[List[str]] = None,
     omit_columns: Optional[List[str]] = None,
-    begin: Union[datetime, int, None] = None,
-    end: Union[datetime, int, None] = None,
+    begin: Union[datetime, int, str, None] = None,
+    end: Union[datetime, int, str, None] = None,
     params: Optional[Dict[str, Any]] = None,
     as_iterator: bool = False,
     as_chunks: bool = False,
@@ -48,12 +48,12 @@ def get_data(
     omit_columns: Optional[List[str]], default None
         If provided, remove these columns from the selection.

-    begin: Union[datetime, int, None], default None
+    begin: Union[datetime, int, str, None], default None
         Lower bound datetime to begin searching for data (inclusive).
         Translates to a `WHERE` clause like `WHERE datetime >= begin`.
         Defaults to `None`.

-    end: Union[datetime, int, None], default None
+    end: Union[datetime, int, str, None], default None
         Upper bound datetime to stop searching for data (inclusive).
         Translates to a `WHERE` clause like `WHERE datetime < end`.
         Defaults to `None`.
@@ -105,11 +105,12 @@ def get_data(
     from meerschaum.utils.venv import Venv
     from meerschaum.connectors import get_connector_plugin
     from meerschaum.utils.misc import iterate_chunks, items_str
-    from meerschaum.utils.dtypes import to_pandas_dtype
+    from meerschaum.utils.dtypes import to_pandas_dtype, coerce_timezone
     from meerschaum.utils.dataframe import add_missing_cols_to_df, df_is_chunk_generator
     from meerschaum.utils.packages import attempt_import
     dd = attempt_import('dask.dataframe') if as_dask else None
     dask = attempt_import('dask') if as_dask else None
+    dateutil_parser = attempt_import('dateutil.parser')

     if select_columns == '*':
         select_columns = None
@@ -120,11 +121,29 @@ def get_data(
         omit_columns = [omit_columns]

     as_iterator = as_iterator or as_chunks
+    dt_col = self.columns.get('datetime', None)
+    dt_typ = self.dtypes.get(dt_col, 'datetime64[ns, UTC]')
+    dt_is_utc = 'utc' in dt_typ.lower()
+    if isinstance(begin, str):
+        try:
+            begin = dateutil_parser.parse(begin)
+        except Exception as e:
+            warn(f"Failed to parse '{begin}' as datetime:\n{e}")
+            begin = None
+    if isinstance(end, str):
+        try:
+            end = dateutil_parser.parse(end)
+        except Exception as e:
+            warn(f"Failed to parse '{end}' as datetime:\n{e}")
+            end = None
+    if isinstance(begin, datetime):
+        begin = coerce_timezone(begin, strip_utc=(not dt_is_utc))
+    if isinstance(end, datetime):
+        end = coerce_timezone(end, strip_utc=(not dt_is_utc))

     def _sort_df(_df):
         if df_is_chunk_generator(_df):
             return _df
-        dt_col = self.columns.get('datetime', None)
         indices = [] if dt_col not in _df.columns else [dt_col]
         non_dt_cols = [
             col
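`begin` and `end` may now be passed as strings: they are parsed with `dateutil` (falling back to `None` with a warning on failure) and then timezone-coerced to match the datetime axis. A quick sketch of the parsing step:

```python
from dateutil import parser as dateutil_parser

begin = dateutil_parser.parse('2024-01-01')           # naive datetime
end = dateutil_parser.parse('2024-02-01T00:00:00Z')   # tz-aware (UTC)
print(begin, end, sep='\n')

# So a call like the following (hypothetical pipe) is now valid:
# pipe.get_data(begin='2024-01-01', end='2024-02-01T00:00:00Z')
```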
@@ -607,7 +626,7 @@ def get_chunk_interval(
     if dt_col is None:
         return timedelta(minutes=chunk_minutes)

-    dt_dtype = self.dtypes.get(dt_col, 'datetime64[ns]')
+    dt_dtype = self.dtypes.get(dt_col, 'datetime64[ns, UTC]')
     if 'int' in dt_dtype.lower():
         return chunk_minutes
     return timedelta(minutes=chunk_minutes)
@@ -101,18 +101,18 @@ def infer_dtypes(self, persist: bool = False, debug: bool = False) -> Dict[str,
         dt_col = self.columns.get('datetime', None)
         if dt_col:
             if not self.parameters.get('dtypes', {}).get(dt_col, None):
-                dtypes[dt_col] = 'datetime64[ns]'
+                dtypes[dt_col] = 'datetime64[ns, UTC]'
         return dtypes

-    from meerschaum.utils.sql import get_pd_type
-    from meerschaum.utils.misc import to_pandas_dtype
+    from meerschaum.utils.dtypes.sql import get_pd_type_from_db_type
+    from meerschaum.utils.dtypes import to_pandas_dtype
     columns_types = self.get_columns_types(debug=debug)

     ### NOTE: get_columns_types() may return either the types as
     ### PostgreSQL- or Pandas-style.
     dtypes = {
         c: (
-            get_pd_type(t, allow_custom_dtypes=True)
+            get_pd_type_from_db_type(t, allow_custom_dtypes=True)
             if str(t).isupper()
             else to_pandas_dtype(t)
         )
@@ -125,7 +125,7 @@ def get_backtrack_interval(
     if dt_col is None:
         return backtrack_interval

-    dt_dtype = self.dtypes.get(dt_col, 'datetime64[ns]')
+    dt_dtype = self.dtypes.get(dt_col, 'datetime64[ns, UTC]')
     if 'int' in dt_dtype.lower():
         return backtrack_minutes

@@ -624,6 +624,18 @@ def filter_existing(
     merge = pd.merge
     NA = pd.NA

+    primary_key = self.columns.get('primary', None)
+    autoincrement = self.parameters.get('autoincrement', False)
+    pipe_columns = self.columns.copy()
+
+    if primary_key and autoincrement and df is not None and primary_key in df.columns:
+        if safe_copy:
+            df = df.copy()
+            safe_copy = False
+        if df[primary_key].isnull().all():
+            del df[primary_key]
+            _ = self.columns.pop(primary_key, None)
+
     def get_empty_df():
         empty_df = pd.DataFrame([])
         dtypes = dict(df.dtypes) if df is not None else {}
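The new guard drops an autoincrementing primary-key column when every incoming value is null, so the database can generate the values and the column stays out of the existing-rows comparison. A standalone sketch of the check:

```python
import pandas as pd

# Hypothetical incoming chunk: the autoincrement primary key is entirely null.
df = pd.DataFrame({
    'id': [None, None],
    'dt': pd.to_datetime(['2024-01-01', '2024-01-02'], utc=True),
})
primary_key = 'id'
if df[primary_key].isnull().all():
    del df[primary_key]  # let the database fill in the identity column
print(list(df.columns))
# ['dt']
```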
@@ -643,8 +655,8 @@ def filter_existing(

     ### begin is the oldest data in the new dataframe
     begin, end = None, None
-    dt_col = self.columns.get('datetime', None)
-    dt_type = self.dtypes.get(dt_col, 'datetime64[ns]') if dt_col else None
+    dt_col = pipe_columns.get('datetime', None)
+    dt_type = self.dtypes.get(dt_col, 'datetime64[ns, UTC]') if dt_col else None
     try:
         min_dt_val = df[dt_col].min(skipna=True) if dt_col else None
         if is_dask and min_dt_val is not None:
@@ -713,7 +725,7 @@ def filter_existing(

     unique_index_vals = {
         col: df[col].unique()
-        for col in self.columns
+        for col in pipe_columns
         if col in df.columns and col != dt_col
     } if not date_bound_only else {}
     filter_params_index_limit = get_config('pipes', 'sync', 'filter_params_index_limit')
@@ -749,7 +761,7 @@ def filter_existing(

     ### Separate new rows from changed ones.
     on_cols = [
-        col for col_key, col in self.columns.items()
+        col for col_key, col in pipe_columns.items()
         if (
             col
             and
@@ -394,7 +394,7 @@ def get_bound_interval(self, debug: bool = False) -> Union[timedelta, int, None]
     if not dt_col:
         return bound_time_value

-    dt_typ = self.dtypes.get(dt_col, 'datetime64[ns]')
+    dt_typ = self.dtypes.get(dt_col, 'datetime64[ns, UTC]')
     if 'int' in dt_typ.lower():
         return int(bound_time_value)

@@ -94,14 +94,14 @@ def filter_unseen_df(
     ----------
     old_df: 'pd.DataFrame'
         The original (target) dataframe. Acts as a filter on the `new_df`.
-
+
     new_df: 'pd.DataFrame'
         The fetched (source) dataframe. Rows that are contained in `old_df` are removed.

     safe_copy: bool, default True
         If `True`, create a copy before comparing and modifying the dataframes.
         Setting to `False` may mutate the DataFrames.
-
+
     dtypes: Optional[Dict[str, Any]], default None
         Optionally specify the datatypes of the dataframe.

@@ -234,8 +234,10 @@ def filter_unseen_df(
     cast_dt_cols = True
     try:
         for col, typ in dt_dtypes.items():
-            tz = typ.split(',')[-1].strip() if ',' in typ else None
-            new_df[col] = coerce_timezone(pd.to_datetime(new_df[col], utc=True))
+            if col in old_df.columns:
+                old_df[col] = coerce_timezone(pd.to_datetime(old_df[col], utc=True))
+            if col in new_df.columns:
+                new_df[col] = coerce_timezone(pd.to_datetime(new_df[col], utc=True))
         cast_dt_cols = False
     except Exception as e:
         warn(f"Could not cast datetime columns:\n{e}")
@@ -363,6 +365,7 @@ def filter_unseen_df(
 def parse_df_datetimes(
     df: 'pd.DataFrame',
     ignore_cols: Optional[Iterable[str]] = None,
+    strip_timezone: bool = True,
     chunksize: Optional[int] = None,
     dtype_backend: str = 'numpy_nullable',
     debug: bool = False,
@@ -378,6 +381,9 @@ def parse_df_datetimes(
     ignore_cols: Optional[Iterable[str]], default None
         If provided, do not attempt to coerce these columns as datetimes.

+    strip_timezone: bool, default True
+        If `True`, remove the UTC `tzinfo` property.
+
     chunksize: Optional[int], default None
         If the pandas implementation is `'dask'`, use this chunksize for the distributed dataframe.

@@ -385,7 +391,7 @@ def parse_df_datetimes(
         If `df` is not a DataFrame and new one needs to be constructed,
         use this as the datatypes backend.
         Accepted values are 'numpy_nullable' and 'pyarrow'.
-
+
     debug: bool, default False
         Verbosity toggle.

@@ -447,7 +453,7 @@ def parse_df_datetimes(
                     for doc in df
                 ] for k in keys
             },
-            npartitions = npartitions,
+            npartitions=npartitions,
         )
     elif isinstance(df, dict):
         df = pd.DataFrame.from_dict(df, npartitions=npartitions)
@@ -500,14 +506,18 @@ def parse_df_datetimes(

     try:
         if not using_dask:
-            df[datetime_cols] = df[datetime_cols].apply(pd.to_datetime, utc=True)
+            df[datetime_cols] = df[datetime_cols].apply(
+                pd.to_datetime,
+                utc=True,
+                format='ISO8601',
+            )
         else:
             df[datetime_cols] = df[datetime_cols].apply(
                 pd.to_datetime,
                 utc=True,
                 axis=1,
                 meta={
-                    col: 'datetime64[ns]'
+                    col: 'datetime64[ns, UTC]'
                     for col in datetime_cols
                 }
             )
@@ -517,11 +527,15 @@ def parse_df_datetimes(
             + f"{traceback.format_exc()}"
         )

-    for dt in datetime_cols:
-        try:
-            df[dt] = df[dt].dt.tz_localize(None)
-        except Exception:
-            warn(f"Unable to convert column '{dt}' to naive datetime:\n{traceback.format_exc()}")
+    if strip_timezone:
+        for dt in datetime_cols:
+            try:
+                df[dt] = df[dt].dt.tz_localize(None)
+            except Exception:
+                warn(
+                    f"Unable to convert column '{dt}' to naive datetime:\n"
+                    + f"{traceback.format_exc()}"
+                )

     return df

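`parse_df_datetimes()` now uses pandas' `ISO8601` fast path (pandas >= 2.0) and only strips timezone information when `strip_timezone=True`. The two paths in plain pandas:

```python
import pandas as pd

s = pd.Series(['2024-01-01T00:00:00Z', '2024-01-02T12:30:00Z'])
parsed = pd.to_datetime(s, utc=True, format='ISO8601')  # requires pandas >= 2.0
naive = parsed.dt.tz_localize(None)                     # the strip_timezone=True branch
print(parsed.dtype, naive.dtype)
# datetime64[ns, UTC] datetime64[ns]
```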
@@ -674,6 +688,7 @@ def enforce_dtypes(
     dtypes: Dict[str, str],
     safe_copy: bool = True,
     coerce_numeric: bool = True,
+    coerce_timezone: bool = True,
     debug: bool = False,
 ) -> 'pd.DataFrame':
     """
@@ -695,6 +710,9 @@ def enforce_dtypes(
     coerce_numeric: bool, default True
         If `True`, convert float and int collisions to numeric.

+    coerce_timezone: bool, default True
+        If `True`, convert datetimes to UTC.
+
     debug: bool, default False
         Verbosity toggle.

@@ -703,20 +721,15 @@ def enforce_dtypes(
         The Pandas DataFrame with the types enforced.
     """
     import json
-    import traceback
-    from decimal import Decimal
     from meerschaum.utils.debug import dprint
-    from meerschaum.utils.warnings import warn
     from meerschaum.utils.formatting import pprint
-    from meerschaum.config.static import STATIC_CONFIG
-    from meerschaum.utils.packages import import_pandas
     from meerschaum.utils.dtypes import (
         are_dtypes_equal,
         to_pandas_dtype,
         is_dtype_numeric,
         attempt_cast_to_numeric,
         attempt_cast_to_uuid,
-        coerce_timezone,
+        coerce_timezone as _coerce_timezone,
     )
     if safe_copy:
         df = df.copy()
@@ -744,6 +757,11 @@ def enforce_dtypes(
         for col, typ in dtypes.items()
         if typ == 'uuid'
     ]
+    datetime_cols = [
+        col
+        for col, typ in dtypes.items()
+        if are_dtypes_equal(typ, 'datetime')
+    ]
     df_numeric_cols = get_numeric_cols(df)
     if debug:
         dprint("Desired data types:")
@@ -792,6 +810,12 @@ def enforce_dtypes(
             if debug:
                 dprint(f"Unable to parse column '{col}' as UUID:\n{e}")

+    if datetime_cols and coerce_timezone:
+        if debug:
+            dprint(f"Checking for datetime conversion: {datetime_cols}")
+        for col in datetime_cols:
+            df[col] = _coerce_timezone(df[col])
+
     df_dtypes = {c: str(t) for c, t in df.dtypes.items()}
     if are_dtypes_equal(df_dtypes, pipe_pandas_dtypes):
         if debug:
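With the new `coerce_timezone` flag, datetime columns are routed through `coerce_timezone()` (imported as `_coerce_timezone` so the parameter doesn't shadow it). A hedged usage sketch:

```python
import pandas as pd
from meerschaum.utils.dataframe import enforce_dtypes

df = pd.DataFrame({'dt': ['2024-01-01', '2024-01-02']})
out = enforce_dtypes(df, {'dt': 'datetime'}, coerce_timezone=True)
print(out['dt'].dtype)
# Expected: datetime64[ns, UTC], since 'datetime' now maps to a tz-aware dtype.
```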
@@ -826,8 +850,7 @@ def enforce_dtypes(
         if debug:
             dprint(
                 "The incoming DataFrame has mostly the same types, skipping enforcement."
-                + "The only detected difference was in the following datetime columns.\n"
-                + " Timezone information may be stripped."
+                + "The only detected difference was in the following datetime columns."
             )
             pprint(detected_dt_cols)
         return df
@@ -930,11 +953,15 @@ def get_datetime_bound_from_df(
     if datetime_column not in df.columns:
         return None

-    dt_val = (
-        df[datetime_column].min(skipna=True)
-        if minimum else df[datetime_column].max(skipna=True)
-    )
-    if is_dask and dt_val is not None:
+    try:
+        dt_val = (
+            df[datetime_column].min(skipna=True)
+            if minimum
+            else df[datetime_column].max(skipna=True)
+        )
+    except Exception:
+        dt_val = pandas.NA
+    if is_dask and dt_val is not None and dt_val is not pandas.NA:
         dt_val = dt_val.compute()

     return (
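`get_datetime_bound_from_df()` now tolerates columns whose `min()`/`max()` raise, falling back to `pandas.NA`. A usage sketch, assuming the `(df, datetime_column, minimum=True)` signature:

```python
import pandas as pd
from meerschaum.utils.dataframe import get_datetime_bound_from_df

df = pd.DataFrame({'dt': pd.to_datetime(['2024-01-02', '2024-01-01'], utc=True)})
print(get_datetime_bound_from_df(df, 'dt'))                 # earliest value
print(get_datetime_bound_from_df(df, 'dt', minimum=False))  # latest value
```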
@@ -1243,12 +1270,12 @@ def query_df(
     end_tz = end.tzinfo if end is not None else None

     if begin_tz is not None or end_tz is not None or df_tz is not None:
-        begin = coerce_timezone(begin)
-        end = coerce_timezone(end)
+        begin = coerce_timezone(begin, strip_utc=False)
+        end = coerce_timezone(end, strip_utc=False)
         if df_tz is not None:
             if debug:
                 dprint(f"Casting column '{datetime_column}' to UTC...")
-            df[datetime_column] = coerce_timezone(df[datetime_column])
+            df[datetime_column] = coerce_timezone(df[datetime_column], strip_utc=False)
         dprint(f"Using datetime bounds:\n{begin=}\n{end=}")

     in_ex_params = get_in_ex_params(params)
@@ -19,7 +19,7 @@ MRSM_PD_DTYPES: Dict[str, str] = {
     'json': 'object',
     'numeric': 'object',
     'uuid': 'object',
-    'datetime': 'datetime64[ns]',
+    'datetime': 'datetime64[ns, UTC]',
     'bool': 'bool[pyarrow]',
     'int': 'Int64',
     'int8': 'Int8',
@@ -245,7 +245,10 @@ def quantize_decimal(x: Decimal, scale: int, precision: int) -> Decimal:
     return x


-def coerce_timezone(dt: Any) -> Any:
+def coerce_timezone(
+    dt: Any,
+    strip_utc: bool = False,
+) -> Any:
     """
     Given a `datetime`, pandas `Timestamp` or `Series` of `Timestamp`,
     return a naive datetime in terms of UTC.
@@ -260,9 +263,17 @@ def coerce_timezone(dt: Any) -> Any:

     if dt_is_series:
         pandas = mrsm.attempt_import('pandas')
-        return pandas.to_datetime(dt, utc=True).apply(lambda x: x.replace(tzinfo=None))
+        dt_series = (
+            pandas.to_datetime(dt, utc=True)
+        )
+        if strip_utc:
+            dt_series = dt_series.apply(lambda x: x.replace(tzinfo=None))
+
+        return dt_series

     if dt.tzinfo is None:
-        return dt
+        if strip_utc:
+            return dt
+        return dt.replace(tzinfo=timezone.utc)

-    return dt.astimezone(timezone.utc).replace(tzinfo=None)
+    return dt.astimezone(timezone.utc)
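`coerce_timezone()` now returns timezone-aware UTC values by default, with `strip_utc=True` restoring the old naive behavior for naive inputs and `Series`. A behavior sketch of the scalar paths per the hunk above:

```python
from datetime import datetime, timezone
from meerschaum.utils.dtypes import coerce_timezone

naive = datetime(2024, 1, 1)
aware = datetime(2024, 1, 1, tzinfo=timezone.utc)

print(coerce_timezone(naive))                  # 2024-01-01 00:00:00+00:00 (assumed UTC, made aware)
print(coerce_timezone(naive, strip_utc=True))  # 2024-01-01 00:00:00 (returned as-is)
print(coerce_timezone(aware))                  # 2024-01-01 00:00:00+00:00 (converted, tzinfo kept)
```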
@@ -7,7 +7,7 @@ Utility functions for working with SQL data types.
 """

 from __future__ import annotations
-from meerschaum.utils.typing import Dict, Union, Tuple
+from meerschaum.utils.typing import Dict, Union, Tuple, List

 NUMERIC_PRECISION_FLAVORS: Dict[str, Tuple[int, int]] = {
     'mariadb': (38, 20),
@@ -16,6 +16,7 @@ NUMERIC_PRECISION_FLAVORS: Dict[str, Tuple[int, int]] = {
     'duckdb': (15, 4),
     'sqlite': (15, 4),
 }
+TIMEZONE_NAIVE_FLAVORS = {'oracle', 'mysql', 'mariadb'}

 ### MySQL doesn't allow for casting as BIGINT, so this is a workaround.
 DB_FLAVORS_CAST_DTYPES = {
@@ -56,6 +57,7 @@ DB_FLAVORS_CAST_DTYPES = {
         'VARCHAR COLLATE "SQL Latin1 General CP1 CI AS"': 'NVARCHAR(MAX)',
         'VARCHAR COLLATE "SQL_Latin1_General_CP1_CI_AS"': 'NVARCHAR(MAX)',
         'NVARCHAR': 'NVARCHAR(MAX)',
+        'BIT': 'INT',
     },
 }
 for _flavor, (_precision, _scale) in NUMERIC_PRECISION_FLAVORS.items():
@@ -78,7 +80,9 @@ DB_TO_PD_DTYPES: Dict[str, Union[str, Dict[str, str]]] = {
     'NUMBER': 'numeric',
     'NUMERIC': 'numeric',
     'TIMESTAMP': 'datetime64[ns]',
+    'TIMESTAMP WITHOUT TIMEZONE': 'datetime64[ns]',
     'TIMESTAMP WITH TIMEZONE': 'datetime64[ns, UTC]',
+    'TIMESTAMP WITH TIME ZONE': 'datetime64[ns, UTC]',
     'TIMESTAMPTZ': 'datetime64[ns, UTC]',
     'DATE': 'datetime64[ns]',
     'DATETIME': 'datetime64[ns]',
@@ -160,7 +164,7 @@ PD_TO_DB_DTYPES_FLAVORS: Dict[str, Dict[str, str]] = {
         'mariadb': 'DATETIME',
         'mysql': 'DATETIME',
         'mssql': 'DATETIME2',
-        'oracle': 'DATE',
+        'oracle': 'TIMESTAMP',
         'sqlite': 'DATETIME',
         'duckdb': 'TIMESTAMP',
         'citus': 'TIMESTAMP',
@@ -168,16 +172,16 @@ PD_TO_DB_DTYPES_FLAVORS: Dict[str, Dict[str, str]] = {
         'default': 'DATETIME',
     },
     'datetime64[ns, UTC]': {
-        'timescaledb': 'TIMESTAMP',
-        'postgresql': 'TIMESTAMP',
-        'mariadb': 'TIMESTAMP',
-        'mysql': 'TIMESTAMP',
+        'timescaledb': 'TIMESTAMPTZ',
+        'postgresql': 'TIMESTAMPTZ',
+        'mariadb': 'TIMESTAMP WITH TIME ZONE',
+        'mysql': 'DATETIME',
         'mssql': 'DATETIMEOFFSET',
         'oracle': 'TIMESTAMP',
         'sqlite': 'TIMESTAMP',
-        'duckdb': 'TIMESTAMP',
-        'citus': 'TIMESTAMP',
-        'cockroachdb': 'TIMESTAMP',
+        'duckdb': 'TIMESTAMPTZ',
+        'citus': 'TIMESTAMPTZ',
+        'cockroachdb': 'TIMESTAMPTZ',
         'default': 'TIMESTAMP',
     },
     'bool': {
@@ -185,7 +189,7 @@ PD_TO_DB_DTYPES_FLAVORS: Dict[str, Dict[str, str]] = {
         'postgresql': 'BOOLEAN',
         'mariadb': 'BOOLEAN',
         'mysql': 'BOOLEAN',
-        'mssql': 'INTEGER',
+        'mssql': 'BIT',
         'oracle': 'INTEGER',
         'sqlite': 'FLOAT',
         'duckdb': 'BOOLEAN',
@@ -301,24 +305,24 @@ PD_TO_SQLALCHEMY_DTYPES_FLAVORS: Dict[str, Dict[str, str]] = {
         'default': 'DateTime',
     },
     'datetime64[ns, UTC]': {
-        'timescaledb': 'DateTime',
-        'postgresql': 'DateTime',
-        'mariadb': 'DateTime',
-        'mysql': 'DateTime',
+        'timescaledb': 'DateTime(timezone=True)',
+        'postgresql': 'DateTime(timezone=True)',
+        'mariadb': 'DateTime(timezone=True)',
+        'mysql': 'DateTime(timezone=True)',
         'mssql': 'sqlalchemy.dialects.mssql.DATETIMEOFFSET',
-        'oracle': 'DateTime',
-        'sqlite': 'DateTime',
-        'duckdb': 'DateTime',
-        'citus': 'DateTime',
-        'cockroachdb': 'DateTime',
-        'default': 'DateTime',
+        'oracle': 'sqlalchemy.dialects.oracle.TIMESTAMP(timezone=True)',
+        'sqlite': 'DateTime(timezone=True)',
+        'duckdb': 'DateTime(timezone=True)',
+        'citus': 'DateTime(timezone=True)',
+        'cockroachdb': 'DateTime(timezone=True)',
+        'default': 'DateTime(timezone=True)',
     },
     'bool': {
         'timescaledb': 'Boolean',
         'postgresql': 'Boolean',
         'mariadb': 'Integer',
         'mysql': 'Integer',
-        'mssql': 'Integer',
+        'mssql': 'sqlalchemy.dialects.mssql.BIT',
         'oracle': 'Integer',
         'sqlite': 'Float',
         'duckdb': 'Boolean',
@@ -393,6 +397,20 @@ PD_TO_SQLALCHEMY_DTYPES_FLAVORS: Dict[str, Dict[str, str]] = {
     },
 }

+AUTO_INCREMENT_COLUMN_FLAVORS: Dict[str, str] = {
+    'timescaledb': 'GENERATED BY DEFAULT AS IDENTITY',
+    'postgresql': 'GENERATED BY DEFAULT AS IDENTITY',
+    'mariadb': 'AUTO_INCREMENT',
+    'mysql': 'AUTO_INCREMENT',
+    'mssql': 'IDENTITY(1,1)',
+    'oracle': 'GENERATED BY DEFAULT ON NULL AS IDENTITY',
+    'sqlite': 'AUTOINCREMENT',
+    'duckdb': 'GENERATED BY DEFAULT',
+    'citus': 'GENERATED BY DEFAULT',
+    'cockroachdb': 'GENERATED BY DEFAULT AS IDENTITY',
+    'default': 'GENERATED BY DEFAULT AS IDENTITY',
+}
+

 def get_pd_type_from_db_type(db_type: str, allow_custom_dtypes: bool = False) -> str:
     """
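The new `AUTO_INCREMENT_COLUMN_FLAVORS` registry maps each flavor to its identity-column clause. DDL assembly along these lines (hypothetical table) is the intended use:

```python
from meerschaum.utils.dtypes.sql import AUTO_INCREMENT_COLUMN_FLAVORS

flavor = 'postgresql'
clause = AUTO_INCREMENT_COLUMN_FLAVORS.get(flavor, AUTO_INCREMENT_COLUMN_FLAVORS['default'])
print(f'CREATE TABLE "weather" ("id" BIGINT {clause} PRIMARY KEY, "dt" TIMESTAMPTZ)')
# CREATE TABLE "weather" ("id" BIGINT GENERATED BY DEFAULT AS IDENTITY PRIMARY KEY, "dt" TIMESTAMPTZ)
```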
@@ -456,10 +474,10 @@ def get_db_type_from_pd_type(
         The database data type for the incoming Pandas data type.
         If nothing can be found, a warning will be thrown and 'TEXT' will be returned.
     """
-    import ast
     from meerschaum.utils.warnings import warn
     from meerschaum.utils.packages import attempt_import
     from meerschaum.utils.dtypes import are_dtypes_equal
+    from meerschaum.utils.misc import parse_arguments_str
     sqlalchemy_types = attempt_import('sqlalchemy.types')

     types_registry = (
@@ -512,15 +530,16 @@ def get_db_type_from_pd_type(

     if db_type.startswith('sqlalchemy.dialects'):
         dialect, typ_class_name = db_type.replace('sqlalchemy.dialects.', '').split('.', maxsplit=2)
-        arg = None
+        cls_args, cls_kwargs = None, None
         if '(' in typ_class_name:
-            typ_class_name, arg_str = typ_class_name.split('(', maxsplit=1)
-            arg = ast.literal_eval(arg_str.rstrip(')'))
+            typ_class_name, args_str = typ_class_name.split('(', maxsplit=1)
+            args_str = args_str.rstrip(')')
+            cls_args, cls_kwargs = parse_arguments_str(args_str)
         sqlalchemy_dialects_flavor_module = attempt_import(f'sqlalchemy.dialects.{dialect}')
         cls = getattr(sqlalchemy_dialects_flavor_module, typ_class_name)
-        if arg is None:
+        if cls_args is None:
             return cls
-        return cls(arg)
+        return cls(*cls_args, **cls_kwargs)

     if 'numeric' in db_type.lower():
         numeric_type_str = PD_TO_DB_DTYPES_FLAVORS['numeric'].get(flavor, 'NUMERIC')
@@ -528,4 +547,15 @@ def get_db_type_from_pd_type(
             return sqlalchemy_types.Numeric
         precision, scale = NUMERIC_PRECISION_FLAVORS[flavor]
         return sqlalchemy_types.Numeric(precision, scale)
-    return getattr(sqlalchemy_types, db_type)
+
+    cls_args, cls_kwargs = None, None
+    typ_class_name = db_type
+    if '(' in db_type:
+        typ_class_name, args_str = db_type.split('(', maxsplit=1)
+        args_str = args_str.rstrip(')')
+        cls_args, cls_kwargs = parse_arguments_str(args_str)
+
+    cls = getattr(sqlalchemy_types, typ_class_name)
+    if cls_args is None:
+        return cls
+    return cls(*cls_args, **cls_kwargs)
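With `parse_arguments_str()`, parameterized type strings such as `'DateTime(timezone=True)'` can now be instantiated, for both dialect-qualified and plain `sqlalchemy.types` names. A usage sketch, assuming the `(pd_type, flavor, as_sqlalchemy=...)` signature:

```python
from meerschaum.utils.dtypes.sql import get_db_type_from_pd_type

print(get_db_type_from_pd_type('datetime64[ns, UTC]', 'postgresql'))
# TIMESTAMPTZ
print(get_db_type_from_pd_type('datetime64[ns, UTC]', 'postgresql', as_sqlalchemy=True))
# a DateTime(timezone=True) instance, built via parse_arguments_str()
```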