meerschaum 2.6.17__py3-none-any.whl → 2.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- meerschaum/actions/delete.py +65 -69
- meerschaum/actions/install.py +1 -2
- meerschaum/api/routes/_pipes.py +7 -8
- meerschaum/config/_default.py +1 -1
- meerschaum/config/_paths.py +2 -1
- meerschaum/config/_version.py +1 -1
- meerschaum/connectors/api/_pipes.py +18 -21
- meerschaum/connectors/sql/_instance.py +11 -12
- meerschaum/connectors/sql/_pipes.py +122 -78
- meerschaum/connectors/sql/_sql.py +43 -8
- meerschaum/connectors/valkey/_pipes.py +12 -1
- meerschaum/core/Pipe/__init__.py +23 -13
- meerschaum/core/Pipe/_attributes.py +25 -1
- meerschaum/core/Pipe/_dtypes.py +23 -16
- meerschaum/core/Pipe/_sync.py +59 -31
- meerschaum/core/Pipe/_verify.py +8 -7
- meerschaum/jobs/_Job.py +2 -0
- meerschaum/plugins/_Plugin.py +11 -14
- meerschaum/utils/daemon/Daemon.py +20 -13
- meerschaum/utils/dataframe.py +178 -16
- meerschaum/utils/dtypes/__init__.py +149 -14
- meerschaum/utils/dtypes/sql.py +41 -7
- meerschaum/utils/misc.py +8 -8
- meerschaum/utils/sql.py +174 -64
- meerschaum/utils/venv/_Venv.py +4 -4
- meerschaum/utils/venv/__init__.py +53 -20
- {meerschaum-2.6.17.dist-info → meerschaum-2.7.0.dist-info}/METADATA +1 -1
- {meerschaum-2.6.17.dist-info → meerschaum-2.7.0.dist-info}/RECORD +34 -34
- {meerschaum-2.6.17.dist-info → meerschaum-2.7.0.dist-info}/LICENSE +0 -0
- {meerschaum-2.6.17.dist-info → meerschaum-2.7.0.dist-info}/NOTICE +0 -0
- {meerschaum-2.6.17.dist-info → meerschaum-2.7.0.dist-info}/WHEEL +0 -0
- {meerschaum-2.6.17.dist-info → meerschaum-2.7.0.dist-info}/entry_points.txt +0 -0
- {meerschaum-2.6.17.dist-info → meerschaum-2.7.0.dist-info}/top_level.txt +0 -0
- {meerschaum-2.6.17.dist-info → meerschaum-2.7.0.dist-info}/zip-safe +0 -0
meerschaum/utils/dataframe.py
CHANGED
@@ -139,7 +139,6 @@ def filter_unseen_df(
     import functools
     import traceback
     from decimal import Decimal
-    from uuid import UUID
     from meerschaum.utils.warnings import warn
     from meerschaum.utils.packages import import_pandas, attempt_import
     from meerschaum.utils.dtypes import (
@@ -147,6 +146,7 @@ def filter_unseen_df(
         are_dtypes_equal,
         attempt_cast_to_numeric,
         attempt_cast_to_uuid,
+        attempt_cast_to_bytes,
         coerce_timezone,
     )
     pd = import_pandas(debug=debug)
@@ -333,6 +333,11 @@ def filter_unseen_df(
     old_uuid_cols = get_uuid_cols(old_df)
     new_uuid_cols = get_uuid_cols(new_df)
     uuid_cols = set(new_uuid_cols + old_uuid_cols)
+
+    old_bytes_cols = get_bytes_cols(old_df)
+    new_bytes_cols = get_bytes_cols(new_df)
+    bytes_cols = set(new_bytes_cols + old_bytes_cols)
+
     joined_df = merge(
         new_df.infer_objects(copy=False).fillna(NA),
         old_df.infer_objects(copy=False).fillna(NA),
@@ -368,6 +373,14 @@ def filter_unseen_df(
         except Exception:
             warn(f"Unable to parse numeric column '{uuid_col}':\n{traceback.format_exc()}")
 
+    for bytes_col in bytes_cols:
+        if bytes_col not in delta_df.columns:
+            continue
+        try:
+            delta_df[bytes_col] = delta_df[bytes_col].apply(attempt_cast_to_bytes)
+        except Exception:
+            warn(f"Unable to parse bytes column '{bytes_col}':\n{traceback.format_exc()}")
+
     return delta_df
 
 
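Note: with this hunk, `filter_unseen_df()` normalizes bytes columns with `attempt_cast_to_bytes()` before computing the row delta, mirroring the existing UUID handling. A minimal sketch of the intended behavior (frames and column names are illustrative, not from the package's tests):

    import pandas as pd
    from meerschaum.utils.dataframe import filter_unseen_df

    # Rows present in both frames are filtered out; only genuinely new rows remain.
    old_df = pd.DataFrame({'id': [1, 2], 'payload': [b'\x00\x01', b'\x02\x03']})
    new_df = pd.DataFrame({'id': [2, 3], 'payload': [b'\x02\x03', b'\x04\x05']})

    delta_df = filter_unseen_df(old_df, new_df)
    print(delta_df)  # expect only the row with id=3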
@@ -429,6 +442,7 @@ def parse_df_datetimes(
     from meerschaum.utils.debug import dprint
     from meerschaum.utils.warnings import warn
     from meerschaum.utils.misc import items_str
+    from meerschaum.utils.dtypes import to_datetime
     import traceback
     pd = import_pandas()
     pandas = attempt_import('pandas')
@@ -480,7 +494,7 @@ def parse_df_datetimes(
     ### skip parsing if DataFrame is empty
     if len(pdf) == 0:
         if debug:
-            dprint(
+            dprint("df is empty. Returning original DataFrame without casting datetime columns...")
         return df
 
     ignore_cols = set(
@@ -494,8 +508,8 @@ def parse_df_datetimes(
 
     if len(cols_to_inspect) == 0:
         if debug:
-            dprint(
-        return df.fillna(pandas.NA)
+            dprint("All columns are ignored, skipping datetime detection...")
+        return df.infer_objects(copy=False).fillna(pandas.NA)
 
     ### apply regex to columns to determine which are ISO datetimes
     iso_dt_regex = r'\d{4}-\d{2}-\d{2}.\d{2}\:\d{2}\:\d+'
@@ -508,21 +522,17 @@ def parse_df_datetimes(
     if not datetime_cols:
         if debug:
             dprint("No columns detected as datetimes, returning...")
-        return df.fillna(pandas.NA)
+        return df.infer_objects(copy=False).fillna(pandas.NA)
 
     if debug:
         dprint("Converting columns to datetimes: " + str(datetime_cols))
 
     try:
         if not using_dask:
-            df[datetime_cols] = df[datetime_cols].apply(
-                pd.to_datetime,
-                utc=True,
-                format='ISO8601',
-            )
+            df[datetime_cols] = df[datetime_cols].apply(to_datetime)
         else:
             df[datetime_cols] = df[datetime_cols].apply(
-                pd.to_datetime,
+                to_datetime,
                 utc=True,
                 axis=1,
                 meta={
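Note: the switch from `pd.to_datetime` to the new `meerschaum.utils.dtypes.to_datetime` (introduced later in this diff) adds tolerance for datetimes outside pandas' nanosecond `Timestamp` range. A hedged sketch, assuming the 2.7.0 API:

    from meerschaum.utils.dtypes import to_datetime

    # pd.to_datetime('3000-01-01') raises OutOfBoundsDatetime under ns precision;
    # the wrapper catches this and falls back to dateutil parsing instead.
    print(to_datetime('3000-01-01'))  # e.g. datetime.datetime(3000, 1, 1, tzinfo=...)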
@@ -665,7 +675,7 @@ def get_uuid_cols(df: 'pd.DataFrame') -> List[str]:
 
     Returns
     -------
-    A list of columns to treat as
+    A list of columns to treat as UUIDs.
     """
     if df is None:
         return []
@@ -692,6 +702,135 @@ def get_uuid_cols(df: 'pd.DataFrame') -> List[str]:
     ]
 
 
+def get_datetime_cols(
+    df: 'pd.DataFrame',
+    timezone_aware: bool = True,
+    timezone_naive: bool = True,
+) -> List[str]:
+    """
+    Get the columns which contain `datetime` or `Timestamp` objects from a Pandas DataFrame.
+
+    Parameters
+    ----------
+    df: pd.DataFrame
+        The DataFrame which may contain `datetime` or `Timestamp` objects.
+
+    timezone_aware: bool, default True
+        If `True`, include timezone-aware datetime columns.
+
+    timezone_naive: bool, default True
+        If `True`, include timezone-naive datetime columns.
+
+    Returns
+    -------
+    A list of columns to treat as datetimes.
+    """
+    if not timezone_aware and not timezone_naive:
+        raise ValueError("`timezone_aware` and `timezone_naive` cannot both be `False`.")
+
+    if df is None:
+        return []
+
+    from datetime import datetime
+    from meerschaum.utils.dtypes import are_dtypes_equal
+    is_dask = 'dask' in df.__module__
+    if is_dask:
+        df = get_first_valid_dask_partition(df)
+
+    known_dt_cols = [
+        col
+        for col, typ in df.dtypes.items()
+        if are_dtypes_equal('datetime', str(typ))
+    ]
+
+    if len(df) == 0:
+        return known_dt_cols
+
+    cols_indices = {
+        col: df[col].first_valid_index()
+        for col in df.columns
+        if col not in known_dt_cols
+    }
+    pydt_cols = [
+        col
+        for col, ix in cols_indices.items()
+        if (
+            ix is not None
+            and
+            isinstance(df.loc[ix][col], datetime)
+        )
+    ]
+    dt_cols_set = set(known_dt_cols + pydt_cols)
+    all_dt_cols = [
+        col
+        for col in df.columns
+        if col in dt_cols_set
+    ]
+    if timezone_aware and timezone_naive:
+        return all_dt_cols
+
+    known_timezone_aware_dt_cols = [
+        col
+        for col in known_dt_cols
+        if getattr(df[col], 'tz', None) is not None
+    ]
+    timezone_aware_pydt_cols = [
+        col
+        for col in pydt_cols
+        if df.loc[cols_indices[col]][col].tzinfo is not None
+    ]
+    timezone_aware_dt_cols_set = set(known_timezone_aware_dt_cols + timezone_aware_pydt_cols)
+    if timezone_aware:
+        return [
+            col
+            for col in all_dt_cols
+            if col in timezone_aware_pydt_cols
+        ]
+
+    return [
+        col
+        for col in all_dt_cols
+        if col not in timezone_aware_dt_cols_set
+    ]
+
+
+def get_bytes_cols(df: 'pd.DataFrame') -> List[str]:
+    """
+    Get the columns which contain bytes strings from a Pandas DataFrame.
+
+    Parameters
+    ----------
+    df: pd.DataFrame
+        The DataFrame which may contain bytes strings.
+
+    Returns
+    -------
+    A list of columns to treat as bytes.
+    """
+    if df is None:
+        return []
+    is_dask = 'dask' in df.__module__
+    if is_dask:
+        df = get_first_valid_dask_partition(df)
+
+    if len(df) == 0:
+        return []
+
+    cols_indices = {
+        col: df[col].first_valid_index()
+        for col in df.columns
+    }
+    return [
+        col
+        for col, ix in cols_indices.items()
+        if (
+            ix is not None
+            and
+            isinstance(df.loc[ix][col], bytes)
+        )
+    ]
+
+
 def enforce_dtypes(
     df: 'pd.DataFrame',
     dtypes: Dict[str, str],
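Note: both new detectors sample each column's first valid value rather than trusting dtypes alone, so object-dtype columns holding `datetime` or `bytes` values are also caught (and dask frames are first reduced to one concrete partition). A small usage sketch with illustrative data:

    from datetime import datetime, timezone
    import pandas as pd
    from meerschaum.utils.dataframe import get_datetime_cols, get_bytes_cols

    df = pd.DataFrame({
        'dt': [datetime(2024, 1, 1, tzinfo=timezone.utc)],
        'blob': [b'\x00\xff'],
        'note': ['plain string'],
    })

    print(get_datetime_cols(df))  # ['dt']
    print(get_bytes_cols(df))     # ['blob']

The `timezone_aware` / `timezone_naive` flags restrict the result to one kind of datetime column; at least one of the two must remain `True`.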
@@ -743,6 +882,7 @@ def enforce_dtypes(
         is_dtype_numeric,
         attempt_cast_to_numeric,
         attempt_cast_to_uuid,
+        attempt_cast_to_bytes,
         coerce_timezone as _coerce_timezone,
     )
     pandas = mrsm.attempt_import('pandas')
@@ -773,6 +913,11 @@ def enforce_dtypes(
         for col, typ in dtypes.items()
         if typ == 'uuid'
     ]
+    bytes_cols = [
+        col
+        for col, typ in dtypes.items()
+        if typ == 'bytes'
+    ]
     datetime_cols = [
         col
         for col, typ in dtypes.items()
@@ -826,6 +971,17 @@ def enforce_dtypes(
                 if debug:
                     dprint(f"Unable to parse column '{col}' as UUID:\n{e}")
 
+    if bytes_cols:
+        if debug:
+            dprint(f"Checking for bytes: {bytes_cols}")
+        for col in bytes_cols:
+            if col in df.columns:
+                try:
+                    df[col] = df[col].apply(attempt_cast_to_bytes)
+                except Exception as e:
+                    if debug:
+                        dprint(f"Unable to parse column '{col}' as bytes:\n{e}")
+
     if datetime_cols and coerce_timezone:
         if debug:
             dprint(f"Checking for datetime conversion: {datetime_cols}")
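Note: `enforce_dtypes()` now honors `'bytes'` entries in a pipe's `dtypes`, coercing serialized strings back into real `bytes` via `attempt_cast_to_bytes()`. A sketch, assuming the 2.7.0 signature:

    import pandas as pd
    from meerschaum.utils.dataframe import enforce_dtypes

    # Base64 text (e.g. freshly parsed from JSON) is coerced back into bytes.
    df = pd.DataFrame({'payload': ['AAE=']})
    df = enforce_dtypes(df, {'payload': 'bytes'})
    print(df['payload'][0])  # b'\x00\x01'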
@@ -931,6 +1087,8 @@ def get_datetime_bound_from_df(
     -------
     The minimum or maximum datetime value in the dataframe, or `None`.
     """
+    from meerschaum.utils.dtypes import to_datetime, value_is_null
+
     if df is None:
         return None
     if not datetime_column:
@@ -982,9 +1140,9 @@ def get_datetime_bound_from_df(
         dt_val = dt_val.compute()
 
     return (
-
+        to_datetime(dt_val, as_pydatetime=True)
         if are_dtypes_equal(str(type(dt_val)), 'datetime')
-        else (dt_val if
+        else (dt_val if not value_is_null(dt_val) else None)
     )
 
     return None
@@ -1127,7 +1285,7 @@ def get_first_valid_dask_partition(ddf: 'dask.dataframe.DataFrame') -> Union['pd.DataFrame', None]:
     for partition in ddf.partitions:
         try:
             pdf = partition.compute()
-        except Exception
+        except Exception:
             continue
         if len(pdf) > 0:
             return pdf
@@ -1408,12 +1566,16 @@ def to_json(
     A JSON string.
     """
     from meerschaum.utils.packages import import_pandas
+    from meerschaum.utils.dtypes import serialize_bytes
     pd = import_pandas()
     uuid_cols = get_uuid_cols(df)
-    if safe_copy and bool(uuid_cols):
+    bytes_cols = get_bytes_cols(df)
+    if safe_copy and bool(uuid_cols or bytes_cols):
         df = df.copy()
     for col in uuid_cols:
         df[col] = df[col].astype(str)
+    for col in bytes_cols:
+        df[col] = df[col].apply(serialize_bytes)
     return df.infer_objects(copy=False).fillna(pd.NA).to_json(
         date_format=date_format,
         date_unit=date_unit,
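Note: since JSON has no binary type, `to_json()` now base64-encodes bytes columns (via the new `serialize_bytes()`), parallel to the existing UUID stringification. A hedged sketch:

    import pandas as pd
    from meerschaum.utils.dataframe import to_json

    df = pd.DataFrame({'payload': [b'\x00\x01']})
    print(to_json(df))  # [{"payload":"AAE="}]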
meerschaum/utils/dtypes/__init__.py
CHANGED
@@ -15,7 +15,19 @@ import meerschaum as mrsm
 from meerschaum.utils.typing import Dict, Union, Any
 from meerschaum.utils.warnings import warn
 
-MRSM_PD_DTYPES: Dict[str, str] = {
+MRSM_ALIAS_DTYPES: Dict[str, str] = {
+    'decimal': 'numeric',
+    'number': 'numeric',
+    'jsonl': 'json',
+    'JSON': 'json',
+    'binary': 'bytes',
+    'blob': 'bytes',
+    'varbinary': 'bytes',
+    'bytea': 'bytes',
+    'guid': 'uuid',
+    'UUID': 'uuid',
+}
+MRSM_PD_DTYPES: Dict[Union[str, None], str] = {
     'json': 'object',
     'numeric': 'object',
     'uuid': 'object',
@@ -27,6 +39,8 @@ MRSM_PD_DTYPES: Dict[str, str] = {
     'int32': 'Int32',
     'int64': 'Int64',
     'str': 'string[python]',
+    'bytes': 'object',
+    None: 'object',
 }
 
 
@@ -38,6 +52,10 @@ def to_pandas_dtype(dtype: str) -> str:
     if known_dtype is not None:
         return known_dtype
 
+    alias_dtype = MRSM_ALIAS_DTYPES.get(dtype, None)
+    if alias_dtype is not None:
+        return MRSM_PD_DTYPES[alias_dtype]
+
     ### NOTE: Kind of a hack, but if the first word of the given dtype is in all caps,
     ### treat it as a SQL db type.
     if dtype.split(' ')[0].isupper():
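Note: the new `MRSM_ALIAS_DTYPES` table lets pipes declare familiar SQL-ish dtype names; `to_pandas_dtype()` resolves the alias before its normal lookup. For example (outputs per the tables in this diff):

    from meerschaum.utils.dtypes import to_pandas_dtype

    print(to_pandas_dtype('blob'))     # 'object'  ('blob' -> 'bytes')
    print(to_pandas_dtype('decimal'))  # 'object'  ('decimal' -> 'numeric')
    print(to_pandas_dtype('guid'))     # 'object'  ('guid' -> 'uuid')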
@@ -95,7 +113,7 @@ def are_dtypes_equal(
     try:
         if ldtype == rdtype:
             return True
-    except Exception
+    except Exception:
         warn(f"Exception when comparing dtypes, returning False:\n{traceback.format_exc()}")
         return False
 
@@ -115,6 +133,10 @@ def are_dtypes_equal(
     if ldtype in uuid_dtypes and rdtype in uuid_dtypes:
         return True
 
+    bytes_dtypes = ('bytes', 'object')
+    if ldtype in bytes_dtypes and rdtype in bytes_dtypes:
+        return True
+
     ldtype_clean = ldtype.split('[', maxsplit=1)[0]
     rdtype_clean = rdtype.split('[', maxsplit=1)[0]
 
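Note: because `bytes` columns are stored with the `object` pandas dtype, `are_dtypes_equal()` now treats the two as interchangeable, mirroring the existing `uuid`/`object` rule:

    from meerschaum.utils.dtypes import are_dtypes_equal

    print(are_dtypes_equal('bytes', 'object'))  # True
    print(are_dtypes_equal('bytes', 'int64'))   # False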
@@ -185,7 +207,7 @@ def attempt_cast_to_numeric(value: Any) -> Any:
             if not value_is_null(value)
             else Decimal('NaN')
         )
-    except Exception
+    except Exception:
         return value
 
 
@@ -201,7 +223,23 @@ def attempt_cast_to_uuid(value: Any) -> Any:
             if not value_is_null(value)
             else None
         )
-    except Exception
+    except Exception:
+        return value
+
+
+def attempt_cast_to_bytes(value: Any) -> Any:
+    """
+    Given a value, attempt to coerce it into a bytestring.
+    """
+    if isinstance(value, bytes):
+        return value
+    try:
+        return (
+            deserialize_bytes_string(str(value))
+            if not value_is_null(value)
+            else None
+        )
+    except Exception:
         return value
 
 
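Note: `attempt_cast_to_bytes()` is deliberately forgiving: real `bytes` pass through, encoded strings are decoded, nulls stay null, and anything unparseable is returned unchanged. A sketch:

    from meerschaum.utils.dtypes import attempt_cast_to_bytes

    print(attempt_cast_to_bytes(b'raw'))  # b'raw' (already bytes)
    print(attempt_cast_to_bytes('AAE='))  # b'\x00\x01' (base64-decoded)
    print(attempt_cast_to_bytes(None))    # None (nulls pass through)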
@@ -251,7 +289,7 @@ def coerce_timezone(
 ) -> Any:
     """
     Given a `datetime`, pandas `Timestamp` or `Series` of `Timestamp`,
-    return a
+    return a UTC timestamp (strip timezone if `strip_utc` is `True`.
     """
     if dt is None:
         return None
@@ -266,9 +304,7 @@ def coerce_timezone(
     dt_is_series = hasattr(dt, 'dtype') and hasattr(dt, '__module__')
 
     if dt_is_series:
-        is_dask = 'dask' in dt.__module__
         pandas = mrsm.attempt_import('pandas', lazy=False)
-        dd = mrsm.attempt_import('dask.dataframe') if is_dask else None
 
         if (
             pandas.api.types.is_datetime64_any_dtype(dt) and (
@@ -279,14 +315,13 @@ def coerce_timezone(
         ):
             return dt
 
-        dt_series = (
-            pandas.to_datetime(dt, utc=True, format='ISO8601')
-            if dd is None
-            else dd.to_datetime(dt, utc=True, format='ISO8601')
-        )
+        dt_series = to_datetime(dt, coerce_utc=False)
         if strip_utc:
-
-
+            try:
+                if dt_series.dt.tz is not None:
+                    dt_series = dt_series.dt.tz_localize(None)
+            except Exception:
+                pass
 
         return dt_series
 
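Note: `coerce_timezone()` now routes series conversion through the new `to_datetime()` and strips timezones defensively inside a `try` block. A hedged sketch of the series behavior, assuming the 2.7.0 signature:

    import pandas as pd
    from meerschaum.utils.dtypes import coerce_timezone

    naive = pd.Series(['2024-01-01 00:00:00', '2024-01-02 12:30:00'])
    print(coerce_timezone(naive).dt.tz)                  # UTC
    print(coerce_timezone(naive, strip_utc=True).dt.tz)  # None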
@@ -299,3 +334,103 @@ def coerce_timezone(
     if strip_utc:
         return utc_dt.replace(tzinfo=None)
     return utc_dt
+
+
+def to_datetime(dt_val: Any, as_pydatetime: bool = False, coerce_utc: bool = True) -> Any:
+    """
+    Wrap `pd.to_datetime()` and add support for out-of-bounds values.
+    """
+    pandas, dateutil_parser = mrsm.attempt_import('pandas', 'dateutil.parser', lazy=False)
+    is_dask = 'dask' in getattr(dt_val, '__module__', '')
+    dd = mrsm.attempt_import('dask.dataframe') if is_dask else None
+    dt_is_series = hasattr(dt_val, 'dtype') and hasattr(dt_val, '__module__')
+    pd = pandas if dd is None else dd
+
+    try:
+        new_dt_val = pd.to_datetime(dt_val, utc=True, format='ISO8601')
+        if as_pydatetime:
+            return new_dt_val.to_pydatetime()
+        return new_dt_val
+    except (pd.errors.OutOfBoundsDatetime, ValueError):
+        pass
+
+    def parse(x: Any) -> Any:
+        try:
+            return dateutil_parser.parse(x)
+        except Exception:
+            return x
+
+    if dt_is_series:
+        new_series = dt_val.apply(parse)
+        if coerce_utc:
+            return coerce_timezone(new_series)
+        return new_series
+
+    new_dt_val = parse(dt_val)
+    if not coerce_utc:
+        return new_dt_val
+    return coerce_timezone(new_dt_val)
+
+
+def serialize_bytes(data: bytes) -> str:
+    """
+    Return the given bytes as a base64-encoded string.
+    """
+    import base64
+    if not isinstance(data, bytes) and value_is_null(data):
+        return data
+    return base64.b64encode(data).decode('utf-8')
+
+
+def deserialize_bytes_string(data: str | None, force_hex: bool = False) -> bytes | None:
+    """
+    Given a serialized ASCII string of bytes data, return the original bytes.
+    The input data may either be base64- or hex-encoded.
+
+    Parameters
+    ----------
+    data: str | None
+        The string to be deserialized into bytes.
+        May be base64- or hex-encoded (prefixed with `'\\x'`).
+
+    force_hex: bool = False
+        If `True`, treat the input string as hex-encoded.
+        If `data` does not begin with the prefix `'\\x'`, set `force_hex` to `True`.
+        This will still strip the leading `'\\x'` prefix if present.
+
+    Returns
+    -------
+    The original bytes used to produce the encoded string `data`.
+    """
+    if not isinstance(data, str) and value_is_null(data):
+        return data
+
+    import binascii
+    import base64
+
+    is_hex = force_hex or data.startswith('\\x')
+
+    if is_hex:
+        if data.startswith('\\x'):
+            data = data[2:]
+        return binascii.unhexlify(data)
+
+    return base64.b64decode(data)
+
+
+def deserialize_base64(data: str) -> bytes:
+    """
+    Return the original bytestring from the given base64-encoded string.
+    """
+    import base64
+    return base64.b64decode(data)
+
+
+def encode_bytes_for_bytea(data: bytes, with_prefix: bool = True) -> str | None:
+    """
+    Return the given bytes as a hex string for PostgreSQL's `BYTEA` type.
+    """
+    import binascii
+    if not isinstance(data, bytes) and value_is_null(data):
+        return data
+    return ('\\x' if with_prefix else '') + binascii.hexlify(data).decode('utf-8')
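Note: these helpers round-trip bytes through the two encodings Meerschaum encounters: base64 for JSON payloads and backslash-x-prefixed hex for PostgreSQL `BYTEA` literals. A sketch:

    from meerschaum.utils.dtypes import (
        serialize_bytes,
        deserialize_bytes_string,
        encode_bytes_for_bytea,
    )

    data = b'\x00\x01\x02'
    b64 = serialize_bytes(data)           # 'AAEC'
    hexed = encode_bytes_for_bytea(data)  # '\\x000102' (literal backslash-x prefix)

    print(deserialize_bytes_string(b64) == data)    # True (base64 path)
    print(deserialize_bytes_string(hexed) == data)  # True (hex path, prefix stripped)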
meerschaum/utils/dtypes/sql.py
CHANGED
@@ -13,9 +13,8 @@ NUMERIC_PRECISION_FLAVORS: Dict[str, Tuple[int, int]] = {
     'mariadb': (38, 20),
     'mysql': (38, 20),
     'mssql': (28, 10),
-    'duckdb': (15, 3),
-    'sqlite': (15, 4),
 }
+NUMERIC_AS_TEXT_FLAVORS = {'sqlite', 'duckdb'}
 TIMEZONE_NAIVE_FLAVORS = {'oracle', 'mysql', 'mariadb'}
 
 ### MySQL doesn't allow for casting as BIGINT, so this is a workaround.
@@ -102,6 +101,10 @@ DB_TO_PD_DTYPES: Dict[str, Union[str, Dict[str, str]]] = {
     'JSONB': 'json',
     'UUID': 'uuid',
     'UNIQUEIDENTIFIER': 'uuid',
+    'BYTEA': 'bytes',
+    'BLOB': 'bytes',
+    'VARBINARY': 'bytes',
+    'VARBINARY(MAX)': 'bytes',
     'substrings': {
         'CHAR': 'string[pyarrow]',
         'TIMESTAMP': 'datetime64[ns]',
@@ -114,6 +117,9 @@ DB_TO_PD_DTYPES: Dict[str, Union[str, Dict[str, str]]] = {
         'INT': 'int64[pyarrow]',
         'BOOL': 'bool[pyarrow]',
         'JSON': 'json',
+        'BYTE': 'bytes',
+        'LOB': 'bytes',
+        'BINARY': 'bytes',
     },
     'default': 'object',
 }
@@ -256,8 +262,8 @@ PD_TO_DB_DTYPES_FLAVORS: Dict[str, Dict[str, str]] = {
         'mysql': f'DECIMAL{NUMERIC_PRECISION_FLAVORS["mysql"]}',
         'mssql': f'NUMERIC{NUMERIC_PRECISION_FLAVORS["mssql"]}',
         'oracle': 'NUMBER',
-        'sqlite':
-        'duckdb': '
+        'sqlite': 'TEXT',
+        'duckdb': 'TEXT',
         'citus': 'NUMERIC',
         'cockroachdb': 'NUMERIC',
         'default': 'NUMERIC',
@@ -276,6 +282,19 @@ PD_TO_DB_DTYPES_FLAVORS: Dict[str, Dict[str, str]] = {
         'cockroachdb': 'UUID',
         'default': 'TEXT',
     },
+    'bytes': {
+        'timescaledb': 'BYTEA',
+        'postgresql': 'BYTEA',
+        'mariadb': 'BLOB',
+        'mysql': 'BLOB',
+        'mssql': 'VARBINARY(MAX)',
+        'oracle': 'BLOB',
+        'sqlite': 'BLOB',
+        'duckdb': 'BLOB',
+        'citus': 'BYTEA',
+        'cockroachdb': 'BYTEA',
+        'default': 'BLOB',
+    },
 }
 PD_TO_SQLALCHEMY_DTYPES_FLAVORS: Dict[str, Dict[str, str]] = {
     'int': {
@@ -402,7 +421,7 @@ PD_TO_SQLALCHEMY_DTYPES_FLAVORS: Dict[str, Dict[str, str]] = {
         'mysql': 'Numeric',
         'mssql': 'Numeric',
         'oracle': 'Numeric',
-        'sqlite': '
+        'sqlite': 'UnicodeText',
         'duckdb': 'Numeric',
         'citus': 'Numeric',
         'cockroachdb': 'Numeric',
@@ -421,6 +440,19 @@ PD_TO_SQLALCHEMY_DTYPES_FLAVORS: Dict[str, Dict[str, str]] = {
         'cockroachdb': 'Uuid',
         'default': 'Uuid',
     },
+    'bytes': {
+        'timescaledb': 'LargeBinary',
+        'postgresql': 'LargeBinary',
+        'mariadb': 'LargeBinary',
+        'mysql': 'LargeBinary',
+        'mssql': 'LargeBinary',
+        'oracle': 'LargeBinary',
+        'sqlite': 'LargeBinary',
+        'duckdb': 'LargeBinary',
+        'citus': 'LargeBinary',
+        'cockroachdb': 'LargeBinary',
+        'default': 'LargeBinary',
+    },
 }
 
 AUTO_INCREMENT_COLUMN_FLAVORS: Dict[str, str] = {
@@ -502,7 +534,7 @@ def get_db_type_from_pd_type(
     """
     from meerschaum.utils.warnings import warn
    from meerschaum.utils.packages import attempt_import
-    from meerschaum.utils.dtypes import are_dtypes_equal
+    from meerschaum.utils.dtypes import are_dtypes_equal, MRSM_ALIAS_DTYPES
     from meerschaum.utils.misc import parse_arguments_str
     sqlalchemy_types = attempt_import('sqlalchemy.types')
 
@@ -512,6 +544,9 @@ def get_db_type_from_pd_type(
         else PD_TO_SQLALCHEMY_DTYPES_FLAVORS
     )
 
+    if pd_type in MRSM_ALIAS_DTYPES:
+        pd_type = MRSM_ALIAS_DTYPES[pd_type]
+
     ### Check whether we are able to match this type (e.g. pyarrow support).
     found_db_type = False
     if pd_type not in types_registry:
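Note: with alias resolution in `get_db_type_from_pd_type()`, both the canonical `'bytes'` dtype and its aliases map to each flavor's binary column type. A sketch, assuming the 2.7.0 signature (expected values per the tables above):

    from meerschaum.utils.dtypes.sql import get_db_type_from_pd_type

    print(get_db_type_from_pd_type('bytes', 'postgresql'))  # 'BYTEA'
    print(get_db_type_from_pd_type('blob', 'mssql'))        # 'VARBINARY(MAX)' ('blob' -> 'bytes')
    print(get_db_type_from_pd_type('bytes', 'oracle'))      # 'BLOB'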
@@ -568,7 +603,6 @@ def get_db_type_from_pd_type(
         return cls(*cls_args, **cls_kwargs)
 
     if 'numeric' in db_type.lower():
-        numeric_type_str = PD_TO_DB_DTYPES_FLAVORS['numeric'].get(flavor, 'NUMERIC')
         if flavor not in NUMERIC_PRECISION_FLAVORS:
             return sqlalchemy_types.Numeric
         precision, scale = NUMERIC_PRECISION_FLAVORS[flavor]