meerschaum 2.5.1__py3-none-any.whl → 2.6.0.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- meerschaum/_internal/arguments/_parser.py +6 -1
- meerschaum/actions/edit.py +6 -6
- meerschaum/actions/sql.py +12 -11
- meerschaum/config/_edit.py +46 -19
- meerschaum/config/_read_config.py +20 -9
- meerschaum/config/_version.py +1 -1
- meerschaum/config/stack/__init__.py +1 -1
- meerschaum/connectors/sql/_pipes.py +80 -24
- meerschaum/connectors/sql/_sql.py +29 -10
- meerschaum/connectors/valkey/_pipes.py +1 -1
- meerschaum/core/Pipe/__init__.py +8 -9
- meerschaum/core/Pipe/_attributes.py +33 -11
- meerschaum/core/Pipe/_data.py +26 -7
- meerschaum/core/Pipe/_dtypes.py +4 -4
- meerschaum/core/Pipe/_fetch.py +1 -1
- meerschaum/core/Pipe/_sync.py +16 -4
- meerschaum/core/Pipe/_verify.py +1 -1
- meerschaum/utils/dataframe.py +56 -29
- meerschaum/utils/dtypes/__init__.py +16 -5
- meerschaum/utils/dtypes/sql.py +58 -28
- meerschaum/utils/misc.py +49 -16
- meerschaum/utils/sql.py +224 -40
- {meerschaum-2.5.1.dist-info → meerschaum-2.6.0.dev1.dist-info}/METADATA +1 -1
- {meerschaum-2.5.1.dist-info → meerschaum-2.6.0.dev1.dist-info}/RECORD +30 -30
- {meerschaum-2.5.1.dist-info → meerschaum-2.6.0.dev1.dist-info}/WHEEL +1 -1
- {meerschaum-2.5.1.dist-info → meerschaum-2.6.0.dev1.dist-info}/LICENSE +0 -0
- {meerschaum-2.5.1.dist-info → meerschaum-2.6.0.dev1.dist-info}/NOTICE +0 -0
- {meerschaum-2.5.1.dist-info → meerschaum-2.6.0.dev1.dist-info}/entry_points.txt +0 -0
- {meerschaum-2.5.1.dist-info → meerschaum-2.6.0.dev1.dist-info}/top_level.txt +0 -0
- {meerschaum-2.5.1.dist-info → meerschaum-2.6.0.dev1.dist-info}/zip-safe +0 -0
meerschaum/core/Pipe/_attributes.py
CHANGED
@@ -103,10 +103,25 @@ def indices(self) -> Union[Dict[str, Union[str, List[str]]], None]:
         if indices_key not in self.parameters:
             self.parameters[indices_key] = {}
         _indices = self.parameters[indices_key]
+        _columns = self.columns
+        dt_col = _columns.get('datetime', None)
         if not isinstance(_indices, dict):
             _indices = {}
             self.parameters[indices_key] = _indices
-
+        unique_cols = (
+            [dt_col]
+            if dt_col
+            else []
+        ) + [
+            col
+            for col_ix, col in _columns.items()
+            if col_ix != 'datetime'
+        ]
+        return {
+            **({'unique': unique_cols} if len(unique_cols) > 1 else {}),
+            **_columns,
+            **_indices
+        }


     @property
@@ -196,7 +211,7 @@ def get_columns(self, *args: str, error: bool = False) -> Union[str, Tuple[str]]
         ----------
         *args: str
             The column names to be retrieved.
-
+
         error: bool, default False
             If `True`, raise an `Exception` if the specified column is not defined.

@@ -509,15 +524,22 @@ def get_indices(self) -> Dict[str, str]:
             if cols
         }
         _index_names = {
-            ix: (
-
-
-
-
-
-                location_key=self.location_key,
-            )
+            ix: _index_template.format(
+                target=_target,
+                column_names=column_names,
+                connector_keys=self.connector_keys,
+                metric_key=self.metric_key,
+                location_key=self.location_key,
             )
             for ix, column_names in _column_names.items()
         }
-
+        ### NOTE: Skip any duplicate indices.
+        seen_index_names = {}
+        for ix, index_name in _index_names.items():
+            if index_name in seen_index_names:
+                continue
+            seen_index_names[index_name] = ix
+        return {
+            ix: index_name
+            for index_name, ix in seen_index_names.items()
+        }
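A minimal standalone sketch of the merge logic the `indices` property now applies (plain Python; the column names are invented for illustration). The composite 'unique' entry only appears when more than one column participates:

    # Sketch of the new `Pipe.indices` merge logic (not the actual method).
    columns = {'datetime': 'dt', 'id': 'station_id'}
    explicit_indices = {}

    dt_col = columns.get('datetime', None)
    unique_cols = ([dt_col] if dt_col else []) + [
        col for col_ix, col in columns.items()
        if col_ix != 'datetime'
    ]
    print({
        **({'unique': unique_cols} if len(unique_cols) > 1 else {}),
        **columns,
        **explicit_indices,
    })
    # {'unique': ['dt', 'station_id'], 'datetime': 'dt', 'id': 'station_id'}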
meerschaum/core/Pipe/_data.py
CHANGED
@@ -23,8 +23,8 @@ def get_data(
     self,
     select_columns: Optional[List[str]] = None,
     omit_columns: Optional[List[str]] = None,
-    begin: Union[datetime, int, None] = None,
-    end: Union[datetime, int, None] = None,
+    begin: Union[datetime, int, str, None] = None,
+    end: Union[datetime, int, str, None] = None,
     params: Optional[Dict[str, Any]] = None,
     as_iterator: bool = False,
     as_chunks: bool = False,
@@ -48,12 +48,12 @@ def get_data(
     omit_columns: Optional[List[str]], default None
         If provided, remove these columns from the selection.

-    begin: Union[datetime, int, None], default None
+    begin: Union[datetime, int, str, None], default None
         Lower bound datetime to begin searching for data (inclusive).
         Translates to a `WHERE` clause like `WHERE datetime >= begin`.
         Defaults to `None`.

-    end: Union[datetime, int, None], default None
+    end: Union[datetime, int, str, None], default None
         Upper bound datetime to stop searching for data (inclusive).
         Translates to a `WHERE` clause like `WHERE datetime < end`.
         Defaults to `None`.
@@ -105,11 +105,12 @@ def get_data(
     from meerschaum.utils.venv import Venv
     from meerschaum.connectors import get_connector_plugin
     from meerschaum.utils.misc import iterate_chunks, items_str
-    from meerschaum.utils.dtypes import to_pandas_dtype
+    from meerschaum.utils.dtypes import to_pandas_dtype, coerce_timezone
     from meerschaum.utils.dataframe import add_missing_cols_to_df, df_is_chunk_generator
     from meerschaum.utils.packages import attempt_import
     dd = attempt_import('dask.dataframe') if as_dask else None
     dask = attempt_import('dask') if as_dask else None
+    dateutil_parser = attempt_import('dateutil.parser')

     if select_columns == '*':
         select_columns = None
@@ -120,11 +121,29 @@ def get_data(
         omit_columns = [omit_columns]

     as_iterator = as_iterator or as_chunks
+    dt_col = self.columns.get('datetime', None)
+    dt_typ = self.dtypes.get(dt_col, 'datetime64[ns, UTC]')
+    dt_is_utc = 'utc' in dt_typ.lower()
+    if isinstance(begin, str):
+        try:
+            begin = dateutil_parser.parse(begin)
+        except Exception as e:
+            warn(f"Failed to parse '{begin}' as datetime:\n{e}")
+            begin = None
+    if isinstance(end, str):
+        try:
+            end = dateutil_parser.parse(end)
+        except Exception as e:
+            warn(f"Failed to parse '{end}' as datetime:\n{e}")
+            end = None
+    if isinstance(begin, datetime):
+        begin = coerce_timezone(begin, strip_utc=(not dt_is_utc))
+    if isinstance(end, datetime):
+        end = coerce_timezone(end, strip_utc=(not dt_is_utc))

     def _sort_df(_df):
         if df_is_chunk_generator(_df):
             return _df
-        dt_col = self.columns.get('datetime', None)
         indices = [] if dt_col not in _df.columns else [dt_col]
         non_dt_cols = [
             col
@@ -607,7 +626,7 @@ def get_chunk_interval(
     if dt_col is None:
         return timedelta(minutes=chunk_minutes)

-    dt_dtype = self.dtypes.get(dt_col, 'datetime64[ns]')
+    dt_dtype = self.dtypes.get(dt_col, 'datetime64[ns, UTC]')
     if 'int' in dt_dtype.lower():
         return chunk_minutes
     return timedelta(minutes=chunk_minutes)
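With these changes, `begin` and `end` may be passed to `get_data()` as strings; they are parsed with `dateutil` and then coerced to match the timezone-awareness of the pipe's datetime axis. A hedged usage sketch (the pipe keys here are hypothetical):

    import meerschaum as mrsm

    pipe = mrsm.Pipe('sql:main', 'weather')  # hypothetical keys

    # Strings are parsed via dateutil.parser, then coerced with
    # coerce_timezone() to match the pipe's datetime dtype.
    df = pipe.get_data(begin='2024-01-01', end='2024-02-01 12:00')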
meerschaum/core/Pipe/_dtypes.py
CHANGED
@@ -101,18 +101,18 @@ def infer_dtypes(self, persist: bool = False, debug: bool = False) -> Dict[str,
         dt_col = self.columns.get('datetime', None)
         if dt_col:
             if not self.parameters.get('dtypes', {}).get(dt_col, None):
-                dtypes[dt_col] = 'datetime64[ns]'
+                dtypes[dt_col] = 'datetime64[ns, UTC]'
         return dtypes

-    from meerschaum.utils.sql import
-    from meerschaum.utils.
+    from meerschaum.utils.dtypes.sql import get_pd_type_from_db_type
+    from meerschaum.utils.dtypes import to_pandas_dtype
     columns_types = self.get_columns_types(debug=debug)

     ### NOTE: get_columns_types() may return either the types as
     ### PostgreSQL- or Pandas-style.
     dtypes = {
         c: (
-
+            get_pd_type_from_db_type(t, allow_custom_dtypes=True)
             if str(t).isupper()
             else to_pandas_dtype(t)
         )
meerschaum/core/Pipe/_fetch.py
CHANGED
@@ -125,7 +125,7 @@ def get_backtrack_interval(
     if dt_col is None:
         return backtrack_interval

-    dt_dtype = self.dtypes.get(dt_col, 'datetime64[ns]')
+    dt_dtype = self.dtypes.get(dt_col, 'datetime64[ns, UTC]')
     if 'int' in dt_dtype.lower():
         return backtrack_minutes

meerschaum/core/Pipe/_sync.py
CHANGED
@@ -624,6 +624,18 @@ def filter_existing(
     merge = pd.merge
     NA = pd.NA

+    primary_key = self.columns.get('primary', None)
+    autoincrement = self.parameters.get('autoincrement', False)
+    pipe_columns = self.columns.copy()
+
+    if primary_key and autoincrement and df is not None and primary_key in df.columns:
+        if safe_copy:
+            df = df.copy()
+            safe_copy = False
+        if df[primary_key].isnull().all():
+            del df[primary_key]
+            _ = self.columns.pop(primary_key, None)
+
     def get_empty_df():
         empty_df = pd.DataFrame([])
         dtypes = dict(df.dtypes) if df is not None else {}
@@ -643,8 +655,8 @@ def filter_existing(

     ### begin is the oldest data in the new dataframe
     begin, end = None, None
-    dt_col =
-    dt_type = self.dtypes.get(dt_col, 'datetime64[ns]') if dt_col else None
+    dt_col = pipe_columns.get('datetime', None)
+    dt_type = self.dtypes.get(dt_col, 'datetime64[ns, UTC]') if dt_col else None
     try:
         min_dt_val = df[dt_col].min(skipna=True) if dt_col else None
         if is_dask and min_dt_val is not None:
@@ -713,7 +725,7 @@ def filter_existing(

     unique_index_vals = {
         col: df[col].unique()
-        for col in
+        for col in pipe_columns
         if col in df.columns and col != dt_col
     } if not date_bound_only else {}
     filter_params_index_limit = get_config('pipes', 'sync', 'filter_params_index_limit')
@@ -749,7 +761,7 @@ def filter_existing(

     ### Separate new rows from changed ones.
     on_cols = [
-        col for col_key, col in
+        col for col_key, col in pipe_columns.items()
         if (
             col
             and
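The new guard at the top of `filter_existing()` drops an entirely-null autoincrementing primary key column before comparing against existing data, letting the database generate the values on insert. A standalone sketch of that check (pandas only; column names invented):

    import pandas as pd

    primary_key = 'id'
    autoincrement = True
    df = pd.DataFrame({
        'id': [None, None],
        'dt': pd.to_datetime(['2024-01-01', '2024-01-02'], utc=True),
    })

    # Drop the primary key column only when every value is null.
    if primary_key and autoincrement and primary_key in df.columns:
        if df[primary_key].isnull().all():
            del df[primary_key]

    print(df.columns.tolist())  # ['dt']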
meerschaum/core/Pipe/_verify.py
CHANGED
@@ -394,7 +394,7 @@ def get_bound_interval(self, debug: bool = False) -> Union[timedelta, int, None]
     if not dt_col:
         return bound_time_value

-    dt_typ = self.dtypes.get(dt_col, 'datetime64[ns]')
+    dt_typ = self.dtypes.get(dt_col, 'datetime64[ns, UTC]')
     if 'int' in dt_typ.lower():
         return int(bound_time_value)

meerschaum/utils/dataframe.py
CHANGED
@@ -94,14 +94,14 @@ def filter_unseen_df(
     ----------
     old_df: 'pd.DataFrame'
         The original (target) dataframe. Acts as a filter on the `new_df`.
-
+
     new_df: 'pd.DataFrame'
         The fetched (source) dataframe. Rows that are contained in `old_df` are removed.

     safe_copy: bool, default True
         If `True`, create a copy before comparing and modifying the dataframes.
         Setting to `False` may mutate the DataFrames.
-
+
     dtypes: Optional[Dict[str, Any]], default None
         Optionally specify the datatypes of the dataframe.

@@ -234,8 +234,10 @@ def filter_unseen_df(
     cast_dt_cols = True
     try:
         for col, typ in dt_dtypes.items():
-
-
+            if col in old_df.columns:
+                old_df[col] = coerce_timezone(pd.to_datetime(old_df[col], utc=True))
+            if col in new_df.columns:
+                new_df[col] = coerce_timezone(pd.to_datetime(new_df[col], utc=True))
         cast_dt_cols = False
     except Exception as e:
         warn(f"Could not cast datetime columns:\n{e}")
@@ -363,6 +365,7 @@ def filter_unseen_df(
 def parse_df_datetimes(
     df: 'pd.DataFrame',
     ignore_cols: Optional[Iterable[str]] = None,
+    strip_timezone: bool = True,
     chunksize: Optional[int] = None,
     dtype_backend: str = 'numpy_nullable',
     debug: bool = False,
@@ -378,6 +381,9 @@ def parse_df_datetimes(
     ignore_cols: Optional[Iterable[str]], default None
         If provided, do not attempt to coerce these columns as datetimes.

+    strip_timezone: bool, default True
+        If `True`, remove the UTC `tzinfo` property.
+
     chunksize: Optional[int], default None
         If the pandas implementation is `'dask'`, use this chunksize for the distributed dataframe.

@@ -385,7 +391,7 @@ def parse_df_datetimes(
         If `df` is not a DataFrame and new one needs to be constructed,
         use this as the datatypes backend.
         Accepted values are 'numpy_nullable' and 'pyarrow'.
-
+
     debug: bool, default False
         Verbosity toggle.

@@ -447,7 +453,7 @@ def parse_df_datetimes(
                     for doc in df
                 ] for k in keys
             },
-            npartitions
+            npartitions=npartitions,
         )
     elif isinstance(df, dict):
         df = pd.DataFrame.from_dict(df, npartitions=npartitions)
@@ -500,14 +506,18 @@ def parse_df_datetimes(

     try:
         if not using_dask:
-            df[datetime_cols] = df[datetime_cols].apply(
+            df[datetime_cols] = df[datetime_cols].apply(
+                pd.to_datetime,
+                utc=True,
+                format='ISO8601',
+            )
         else:
             df[datetime_cols] = df[datetime_cols].apply(
                 pd.to_datetime,
                 utc=True,
                 axis=1,
                 meta={
-                    col: 'datetime64[ns]'
+                    col: 'datetime64[ns, UTC]'
                     for col in datetime_cols
                 }
             )
@@ -517,11 +527,15 @@ def parse_df_datetimes(
             + f"{traceback.format_exc()}"
         )

-
-
-
-
-
+    if strip_timezone:
+        for dt in datetime_cols:
+            try:
+                df[dt] = df[dt].dt.tz_localize(None)
+            except Exception:
+                warn(
+                    f"Unable to convert column '{dt}' to naive datetime:\n"
+                    + f"{traceback.format_exc()}"
+                )

     return df

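The new `strip_timezone` flag determines whether detected datetime columns keep their UTC awareness after parsing. A sketch of the underlying pandas behavior (pandas 2.x, which provides the `format='ISO8601'` argument):

    import pandas as pd

    df = pd.DataFrame({'dt': ['2024-01-01T00:00:00Z', '2024-01-02T00:00:00Z']})

    # parse_df_datetimes() now coerces to timezone-aware UTC first ...
    df['dt'] = pd.to_datetime(df['dt'], utc=True, format='ISO8601')
    print(df['dt'].dtype)  # datetime64[ns, UTC]

    # ... then strips the tzinfo when strip_timezone=True.
    df['dt'] = df['dt'].dt.tz_localize(None)
    print(df['dt'].dtype)  # datetime64[ns]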
@@ -674,6 +688,7 @@ def enforce_dtypes(
     dtypes: Dict[str, str],
     safe_copy: bool = True,
     coerce_numeric: bool = True,
+    coerce_timezone: bool = True,
     debug: bool = False,
 ) -> 'pd.DataFrame':
     """
@@ -695,6 +710,9 @@ def enforce_dtypes(
     coerce_numeric: bool, default True
         If `True`, convert float and int collisions to numeric.

+    coerce_timezone: bool, default True
+        If `True`, convert datetimes to UTC.
+
     debug: bool, default False
         Verbosity toggle.

@@ -703,20 +721,15 @@ def enforce_dtypes(
         The Pandas DataFrame with the types enforced.
     """
     import json
-    import traceback
-    from decimal import Decimal
     from meerschaum.utils.debug import dprint
-    from meerschaum.utils.warnings import warn
     from meerschaum.utils.formatting import pprint
-    from meerschaum.config.static import STATIC_CONFIG
-    from meerschaum.utils.packages import import_pandas
     from meerschaum.utils.dtypes import (
         are_dtypes_equal,
         to_pandas_dtype,
         is_dtype_numeric,
         attempt_cast_to_numeric,
         attempt_cast_to_uuid,
-        coerce_timezone,
+        coerce_timezone as _coerce_timezone,
     )
     if safe_copy:
         df = df.copy()
@@ -744,6 +757,11 @@ def enforce_dtypes(
         for col, typ in dtypes.items()
         if typ == 'uuid'
     ]
+    datetime_cols = [
+        col
+        for col, typ in dtypes.items()
+        if are_dtypes_equal(typ, 'datetime')
+    ]
     df_numeric_cols = get_numeric_cols(df)
     if debug:
         dprint("Desired data types:")
@@ -792,6 +810,12 @@ def enforce_dtypes(
         if debug:
             dprint(f"Unable to parse column '{col}' as UUID:\n{e}")

+    if datetime_cols and coerce_timezone:
+        if debug:
+            dprint(f"Checking for datetime conversion: {datetime_cols}")
+        for col in datetime_cols:
+            df[col] = _coerce_timezone(df[col])
+
     df_dtypes = {c: str(t) for c, t in df.dtypes.items()}
     if are_dtypes_equal(df_dtypes, pipe_pandas_dtypes):
         if debug:
@@ -826,8 +850,7 @@ def enforce_dtypes(
         if debug:
             dprint(
                 "The incoming DataFrame has mostly the same types, skipping enforcement."
-                + "The only detected difference was in the following datetime columns
-                + " Timezone information may be stripped."
+                + "The only detected difference was in the following datetime columns."
             )
             pprint(detected_dt_cols)
         return df
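With the new `coerce_timezone` flag, `enforce_dtypes()` passes any column whose desired dtype is a datetime through `coerce_timezone()`. A hedged sketch, assuming the defaults shown in this diff:

    import pandas as pd
    from meerschaum.utils.dataframe import enforce_dtypes

    df = pd.DataFrame({'dt': pd.to_datetime(['2024-01-01'])})  # naive
    # coerce_timezone defaults to True, so the column should become UTC-aware.
    df = enforce_dtypes(df, {'dt': 'datetime64[ns, UTC]'})
    print(df['dt'].dtype)  # expected: datetime64[ns, UTC]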
@@ -930,11 +953,15 @@ def get_datetime_bound_from_df(
     if datetime_column not in df.columns:
         return None

-
-
-
-
-
+    try:
+        dt_val = (
+            df[datetime_column].min(skipna=True)
+            if minimum
+            else df[datetime_column].max(skipna=True)
+        )
+    except Exception:
+        dt_val = pandas.NA
+    if is_dask and dt_val is not None and dt_val is not pandas.NA:
         dt_val = dt_val.compute()

     return (
@@ -1243,12 +1270,12 @@ def query_df(
     end_tz = end.tzinfo if end is not None else None

     if begin_tz is not None or end_tz is not None or df_tz is not None:
-        begin = coerce_timezone(begin)
-        end = coerce_timezone(end)
+        begin = coerce_timezone(begin, strip_utc=False)
+        end = coerce_timezone(end, strip_utc=False)
         if df_tz is not None:
             if debug:
                 dprint(f"Casting column '{datetime_column}' to UTC...")
-            df[datetime_column] = coerce_timezone(df[datetime_column])
+            df[datetime_column] = coerce_timezone(df[datetime_column], strip_utc=False)
         dprint(f"Using datetime bounds:\n{begin=}\n{end=}")

     in_ex_params = get_in_ex_params(params)
meerschaum/utils/dtypes/__init__.py
CHANGED
@@ -19,7 +19,7 @@ MRSM_PD_DTYPES: Dict[str, str] = {
     'json': 'object',
     'numeric': 'object',
     'uuid': 'object',
-    'datetime': 'datetime64[ns]',
+    'datetime': 'datetime64[ns, UTC]',
     'bool': 'bool[pyarrow]',
     'int': 'Int64',
     'int8': 'Int8',
@@ -245,7 +245,10 @@ def quantize_decimal(x: Decimal, scale: int, precision: int) -> Decimal:
     return x


-def coerce_timezone(dt: Any) -> Any:
+def coerce_timezone(
+    dt: Any,
+    strip_utc: bool = False,
+) -> Any:
     """
     Given a `datetime`, pandas `Timestamp` or `Series` of `Timestamp`,
     return a naive datetime in terms of UTC.
@@ -260,9 +263,17 @@ def coerce_timezone(dt: Any) -> Any:

     if dt_is_series:
         pandas = mrsm.attempt_import('pandas')
-
+        dt_series = (
+            pandas.to_datetime(dt, utc=True)
+        )
+        if strip_utc:
+            dt_series = dt_series.apply(lambda x: x.replace(tzinfo=None))
+
+        return dt_series

     if dt.tzinfo is None:
-
+        if strip_utc:
+            return dt
+        return dt.replace(tzinfo=timezone.utc)

-    return dt.astimezone(timezone.utc)
+    return dt.astimezone(timezone.utc)
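The reworked `coerce_timezone()` defaults to producing timezone-aware UTC values and only skips the conversion on request. A sketch of the scalar behavior per the hunk above:

    from datetime import datetime, timezone
    from meerschaum.utils.dtypes import coerce_timezone

    naive = datetime(2024, 1, 1, 12, 0)

    # Naive datetimes are assumed to be UTC and made timezone-aware ...
    print(coerce_timezone(naive))  # 2024-01-01 12:00:00+00:00

    # ... unless strip_utc=True, which returns naive values unchanged.
    print(coerce_timezone(naive, strip_utc=True))  # 2024-01-01 12:00:00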
meerschaum/utils/dtypes/sql.py
CHANGED
@@ -7,7 +7,7 @@ Utility functions for working with SQL data types.
 """

 from __future__ import annotations
-from meerschaum.utils.typing import Dict, Union, Tuple
+from meerschaum.utils.typing import Dict, Union, Tuple, List

 NUMERIC_PRECISION_FLAVORS: Dict[str, Tuple[int, int]] = {
     'mariadb': (38, 20),
@@ -16,6 +16,7 @@ NUMERIC_PRECISION_FLAVORS: Dict[str, Tuple[int, int]] = {
     'duckdb': (15, 4),
     'sqlite': (15, 4),
 }
+TIMEZONE_NAIVE_FLAVORS = {'oracle', 'mysql', 'mariadb'}

 ### MySQL doesn't allow for casting as BIGINT, so this is a workaround.
 DB_FLAVORS_CAST_DTYPES = {
@@ -56,6 +57,7 @@ DB_FLAVORS_CAST_DTYPES = {
         'VARCHAR COLLATE "SQL Latin1 General CP1 CI AS"': 'NVARCHAR(MAX)',
         'VARCHAR COLLATE "SQL_Latin1_General_CP1_CI_AS"': 'NVARCHAR(MAX)',
         'NVARCHAR': 'NVARCHAR(MAX)',
+        'BIT': 'INT',
     },
 }
 for _flavor, (_precision, _scale) in NUMERIC_PRECISION_FLAVORS.items():
@@ -78,7 +80,9 @@ DB_TO_PD_DTYPES: Dict[str, Union[str, Dict[str, str]]] = {
     'NUMBER': 'numeric',
     'NUMERIC': 'numeric',
     'TIMESTAMP': 'datetime64[ns]',
+    'TIMESTAMP WITHOUT TIMEZONE': 'datetime64[ns]',
     'TIMESTAMP WITH TIMEZONE': 'datetime64[ns, UTC]',
+    'TIMESTAMP WITH TIME ZONE': 'datetime64[ns, UTC]',
     'TIMESTAMPTZ': 'datetime64[ns, UTC]',
     'DATE': 'datetime64[ns]',
     'DATETIME': 'datetime64[ns]',
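The two new `TIMESTAMP ...` aliases cover both spellings emitted by different database flavors, so reflected columns resolve to the expected pandas dtypes. A hedged sketch using the lookup function defined in this module:

    from meerschaum.utils.dtypes.sql import get_pd_type_from_db_type

    # Per the DB_TO_PD_DTYPES table above:
    print(get_pd_type_from_db_type('TIMESTAMP WITH TIME ZONE'))    # expected: datetime64[ns, UTC]
    print(get_pd_type_from_db_type('TIMESTAMP WITHOUT TIMEZONE'))  # expected: datetime64[ns]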
@@ -160,7 +164,7 @@ PD_TO_DB_DTYPES_FLAVORS: Dict[str, Dict[str, str]] = {
         'mariadb': 'DATETIME',
         'mysql': 'DATETIME',
         'mssql': 'DATETIME2',
-        'oracle': '
+        'oracle': 'TIMESTAMP',
         'sqlite': 'DATETIME',
         'duckdb': 'TIMESTAMP',
         'citus': 'TIMESTAMP',
@@ -168,16 +172,16 @@ PD_TO_DB_DTYPES_FLAVORS: Dict[str, Dict[str, str]] = {
         'default': 'DATETIME',
     },
     'datetime64[ns, UTC]': {
-        'timescaledb': '
-        'postgresql': '
-        'mariadb': 'TIMESTAMP',
-        'mysql': '
+        'timescaledb': 'TIMESTAMPTZ',
+        'postgresql': 'TIMESTAMPTZ',
+        'mariadb': 'TIMESTAMP WITH TIME ZONE',
+        'mysql': 'DATETIME',
         'mssql': 'DATETIMEOFFSET',
         'oracle': 'TIMESTAMP',
         'sqlite': 'TIMESTAMP',
-        'duckdb': '
-        'citus': '
-        'cockroachdb': '
+        'duckdb': 'TIMESTAMPTZ',
+        'citus': 'TIMESTAMPTZ',
+        'cockroachdb': 'TIMESTAMPTZ',
         'default': 'TIMESTAMP',
     },
     'bool': {
@@ -185,7 +189,7 @@ PD_TO_DB_DTYPES_FLAVORS: Dict[str, Dict[str, str]] = {
         'postgresql': 'BOOLEAN',
         'mariadb': 'BOOLEAN',
         'mysql': 'BOOLEAN',
-        'mssql': '
+        'mssql': 'BIT',
         'oracle': 'INTEGER',
         'sqlite': 'FLOAT',
         'duckdb': 'BOOLEAN',
@@ -301,24 +305,24 @@ PD_TO_SQLALCHEMY_DTYPES_FLAVORS: Dict[str, Dict[str, str]] = {
         'default': 'DateTime',
     },
     'datetime64[ns, UTC]': {
-        'timescaledb': 'DateTime',
-        'postgresql': 'DateTime',
-        'mariadb': 'DateTime',
-        'mysql': 'DateTime',
+        'timescaledb': 'DateTime(timezone=True)',
+        'postgresql': 'DateTime(timezone=True)',
+        'mariadb': 'DateTime(timezone=True)',
+        'mysql': 'DateTime(timezone=True)',
         'mssql': 'sqlalchemy.dialects.mssql.DATETIMEOFFSET',
-        'oracle': '
-        'sqlite': 'DateTime',
-        'duckdb': 'DateTime',
-        'citus': 'DateTime',
-        'cockroachdb': 'DateTime',
-        'default': 'DateTime',
+        'oracle': 'sqlalchemy.dialects.oracle.TIMESTAMP(timezone=True)',
+        'sqlite': 'DateTime(timezone=True)',
+        'duckdb': 'DateTime(timezone=True)',
+        'citus': 'DateTime(timezone=True)',
+        'cockroachdb': 'DateTime(timezone=True)',
+        'default': 'DateTime(timezone=True)',
     },
     'bool': {
         'timescaledb': 'Boolean',
         'postgresql': 'Boolean',
         'mariadb': 'Integer',
         'mysql': 'Integer',
-        'mssql': '
+        'mssql': 'sqlalchemy.dialects.mssql.BIT',
         'oracle': 'Integer',
         'sqlite': 'Float',
         'duckdb': 'Boolean',
@@ -393,6 +397,20 @@ PD_TO_SQLALCHEMY_DTYPES_FLAVORS: Dict[str, Dict[str, str]] = {
     },
 }

+AUTO_INCREMENT_COLUMN_FLAVORS: Dict[str, str] = {
+    'timescaledb': 'GENERATED BY DEFAULT AS IDENTITY',
+    'postgresql': 'GENERATED BY DEFAULT AS IDENTITY',
+    'mariadb': 'AUTO_INCREMENT',
+    'mysql': 'AUTO_INCREMENT',
+    'mssql': 'IDENTITY(1,1)',
+    'oracle': 'GENERATED BY DEFAULT ON NULL AS IDENTITY',
+    'sqlite': 'AUTOINCREMENT',
+    'duckdb': 'GENERATED BY DEFAULT',
+    'citus': 'GENERATED BY DEFAULT',
+    'cockroachdb': 'GENERATED BY DEFAULT AS IDENTITY',
+    'default': 'GENERATED BY DEFAULT AS IDENTITY',
+}
+

 def get_pd_type_from_db_type(db_type: str, allow_custom_dtypes: bool = False) -> str:
     """
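A sketch of how the new `AUTO_INCREMENT_COLUMN_FLAVORS` lookup could be interpolated into DDL. The helper and names below are invented for illustration and are not an API from the package:

    from meerschaum.utils.dtypes.sql import AUTO_INCREMENT_COLUMN_FLAVORS

    def build_pk_clause(flavor: str, col: str = 'id') -> str:
        """Render a hypothetical auto-incrementing primary key clause."""
        increment = AUTO_INCREMENT_COLUMN_FLAVORS.get(
            flavor,
            AUTO_INCREMENT_COLUMN_FLAVORS['default'],
        )
        return f'"{col}" BIGINT {increment} PRIMARY KEY'

    print(build_pk_clause('postgresql'))
    # "id" BIGINT GENERATED BY DEFAULT AS IDENTITY PRIMARY KEY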
@@ -456,10 +474,10 @@ def get_db_type_from_pd_type(
         The database data type for the incoming Pandas data type.
         If nothing can be found, a warning will be thrown and 'TEXT' will be returned.
     """
-    import ast
     from meerschaum.utils.warnings import warn
     from meerschaum.utils.packages import attempt_import
     from meerschaum.utils.dtypes import are_dtypes_equal
+    from meerschaum.utils.misc import parse_arguments_str
     sqlalchemy_types = attempt_import('sqlalchemy.types')

     types_registry = (
@@ -512,15 +530,16 @@ def get_db_type_from_pd_type(

     if db_type.startswith('sqlalchemy.dialects'):
         dialect, typ_class_name = db_type.replace('sqlalchemy.dialects.', '').split('.', maxsplit=2)
-
+        cls_args, cls_kwargs = None, None
         if '(' in typ_class_name:
-            typ_class_name,
-
+            typ_class_name, args_str = typ_class_name.split('(', maxsplit=1)
+            args_str = args_str.rstrip(')')
+            cls_args, cls_kwargs = parse_arguments_str(args_str)
         sqlalchemy_dialects_flavor_module = attempt_import(f'sqlalchemy.dialects.{dialect}')
         cls = getattr(sqlalchemy_dialects_flavor_module, typ_class_name)
-        if
+        if cls_args is None:
             return cls
-        return cls(
+        return cls(*cls_args, **cls_kwargs)

     if 'numeric' in db_type.lower():
         numeric_type_str = PD_TO_DB_DTYPES_FLAVORS['numeric'].get(flavor, 'NUMERIC')
@@ -528,4 +547,15 @@ def get_db_type_from_pd_type(
             return sqlalchemy_types.Numeric
         precision, scale = NUMERIC_PRECISION_FLAVORS[flavor]
         return sqlalchemy_types.Numeric(precision, scale)
-
+
+    cls_args, cls_kwargs = None, None
+    typ_class_name = db_type
+    if '(' in db_type:
+        typ_class_name, args_str = db_type.split('(', maxsplit=1)
+        args_str = args_str.rstrip(')')
+        cls_args, cls_kwargs = parse_arguments_str(args_str)
+
+    cls = getattr(sqlalchemy_types, typ_class_name)
+    if cls_args is None:
+        return cls
+    return cls(*cls_args, **cls_kwargs)
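Together with `parse_arguments_str()`, type strings in the tables above may now carry constructor arguments, e.g. 'DateTime(timezone=True)' or 'Numeric(28, 10)', which are split off and passed to the SQLAlchemy class. A heavily hedged usage sketch (the `as_sqlalchemy` keyword is an assumption about this function's signature):

    from meerschaum.utils.dtypes.sql import get_db_type_from_pd_type

    # 'datetime64[ns, UTC]' should now resolve to a timezone-aware
    # type object for PostgreSQL-like flavors.
    typ = get_db_type_from_pd_type(
        'datetime64[ns, UTC]',
        flavor='postgresql',
        as_sqlalchemy=True,  # assumed keyword, per the SQLAlchemy code path above
    )
    print(typ)  # expected: DateTime(timezone=True)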