meerschaum 2.6.17__py3-none-any.whl → 2.7.0__py3-none-any.whl
- meerschaum/actions/delete.py +65 -69
- meerschaum/actions/install.py +1 -2
- meerschaum/api/routes/_pipes.py +7 -8
- meerschaum/config/_default.py +1 -1
- meerschaum/config/_paths.py +2 -1
- meerschaum/config/_version.py +1 -1
- meerschaum/connectors/api/_pipes.py +18 -21
- meerschaum/connectors/sql/_instance.py +11 -12
- meerschaum/connectors/sql/_pipes.py +122 -78
- meerschaum/connectors/sql/_sql.py +43 -8
- meerschaum/connectors/valkey/_pipes.py +12 -1
- meerschaum/core/Pipe/__init__.py +23 -13
- meerschaum/core/Pipe/_attributes.py +25 -1
- meerschaum/core/Pipe/_dtypes.py +23 -16
- meerschaum/core/Pipe/_sync.py +59 -31
- meerschaum/core/Pipe/_verify.py +8 -7
- meerschaum/jobs/_Job.py +2 -0
- meerschaum/plugins/_Plugin.py +11 -14
- meerschaum/utils/daemon/Daemon.py +20 -13
- meerschaum/utils/dataframe.py +178 -16
- meerschaum/utils/dtypes/__init__.py +149 -14
- meerschaum/utils/dtypes/sql.py +41 -7
- meerschaum/utils/misc.py +8 -8
- meerschaum/utils/sql.py +174 -64
- meerschaum/utils/venv/_Venv.py +4 -4
- meerschaum/utils/venv/__init__.py +53 -20
- {meerschaum-2.6.17.dist-info → meerschaum-2.7.0.dist-info}/METADATA +1 -1
- {meerschaum-2.6.17.dist-info → meerschaum-2.7.0.dist-info}/RECORD +34 -34
- {meerschaum-2.6.17.dist-info → meerschaum-2.7.0.dist-info}/LICENSE +0 -0
- {meerschaum-2.6.17.dist-info → meerschaum-2.7.0.dist-info}/NOTICE +0 -0
- {meerschaum-2.6.17.dist-info → meerschaum-2.7.0.dist-info}/WHEEL +0 -0
- {meerschaum-2.6.17.dist-info → meerschaum-2.7.0.dist-info}/entry_points.txt +0 -0
- {meerschaum-2.6.17.dist-info → meerschaum-2.7.0.dist-info}/top_level.txt +0 -0
- {meerschaum-2.6.17.dist-info → meerschaum-2.7.0.dist-info}/zip-safe +0 -0
meerschaum/utils/dataframe.py
CHANGED
```diff
@@ -139,7 +139,6 @@ def filter_unseen_df(
     import functools
     import traceback
     from decimal import Decimal
-    from uuid import UUID
     from meerschaum.utils.warnings import warn
     from meerschaum.utils.packages import import_pandas, attempt_import
     from meerschaum.utils.dtypes import (
@@ -147,6 +146,7 @@ def filter_unseen_df(
         are_dtypes_equal,
         attempt_cast_to_numeric,
         attempt_cast_to_uuid,
+        attempt_cast_to_bytes,
         coerce_timezone,
     )
     pd = import_pandas(debug=debug)
@@ -333,6 +333,11 @@ def filter_unseen_df(
     old_uuid_cols = get_uuid_cols(old_df)
     new_uuid_cols = get_uuid_cols(new_df)
     uuid_cols = set(new_uuid_cols + old_uuid_cols)
+
+    old_bytes_cols = get_bytes_cols(old_df)
+    new_bytes_cols = get_bytes_cols(new_df)
+    bytes_cols = set(new_bytes_cols + old_bytes_cols)
+
     joined_df = merge(
         new_df.infer_objects(copy=False).fillna(NA),
         old_df.infer_objects(copy=False).fillna(NA),
@@ -368,6 +373,14 @@ def filter_unseen_df(
         except Exception:
             warn(f"Unable to parse numeric column '{uuid_col}':\n{traceback.format_exc()}")
 
+    for bytes_col in bytes_cols:
+        if bytes_col not in delta_df.columns:
+            continue
+        try:
+            delta_df[bytes_col] = delta_df[bytes_col].apply(attempt_cast_to_bytes)
+        except Exception:
+            warn(f"Unable to parse bytes column '{bytes_col}':\n{traceback.format_exc()}")
+
     return delta_df
 
 
```
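The bytes handling mirrors the existing UUID logic: bytes columns are collected from both frames and re-cast with `attempt_cast_to_bytes` after the merge. A minimal sketch of the resulting behavior (the sample frames and column names are illustrative):

```python
# Sketch: with 2.7.0, bytes columns survive the delta comparison.
import pandas as pd
from meerschaum.utils.dataframe import filter_unseen_df

old_df = pd.DataFrame({'id': [1, 2], 'payload': [b'\x00\x01', b'\x02\x03']})
new_df = pd.DataFrame({'id': [1, 2, 3], 'payload': [b'\x00\x01', b'\x02\x03', b'\x04\x05']})

delta_df = filter_unseen_df(old_df, new_df)
print(delta_df)  # expect only the previously unseen row (id=3), with `payload` still bytes
```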
```diff
@@ -429,6 +442,7 @@ def parse_df_datetimes(
     from meerschaum.utils.debug import dprint
     from meerschaum.utils.warnings import warn
     from meerschaum.utils.misc import items_str
+    from meerschaum.utils.dtypes import to_datetime
     import traceback
     pd = import_pandas()
     pandas = attempt_import('pandas')
@@ -480,7 +494,7 @@
     ### skip parsing if DataFrame is empty
     if len(pdf) == 0:
         if debug:
-            dprint(
+            dprint("df is empty. Returning original DataFrame without casting datetime columns...")
         return df
 
     ignore_cols = set(
@@ -494,8 +508,8 @@
 
     if len(cols_to_inspect) == 0:
         if debug:
-            dprint(
-        return df.fillna(pandas.NA)
+            dprint("All columns are ignored, skipping datetime detection...")
+        return df.infer_objects(copy=False).fillna(pandas.NA)
 
     ### apply regex to columns to determine which are ISO datetimes
     iso_dt_regex = r'\d{4}-\d{2}-\d{2}.\d{2}\:\d{2}\:\d+'
@@ -508,21 +522,17 @@
     if not datetime_cols:
         if debug:
             dprint("No columns detected as datetimes, returning...")
-        return df.fillna(pandas.NA)
+        return df.infer_objects(copy=False).fillna(pandas.NA)
 
     if debug:
         dprint("Converting columns to datetimes: " + str(datetime_cols))
 
     try:
         if not using_dask:
-            df[datetime_cols] = df[datetime_cols].apply(
-                pd.to_datetime,
-                utc=True,
-                format='ISO8601',
-            )
+            df[datetime_cols] = df[datetime_cols].apply(to_datetime)
         else:
             df[datetime_cols] = df[datetime_cols].apply(
-                pd.to_datetime,
+                to_datetime,
                 utc=True,
                 axis=1,
                 meta={
```
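`parse_df_datetimes()` now delegates to the new `to_datetime()` wrapper (added in `meerschaum/utils/dtypes/__init__.py` below), which falls back to `dateutil` when a value lies outside the pandas nanosecond `Timestamp` range. A sketch with an illustrative out-of-bounds value:

```python
import pandas as pd
from meerschaum.utils.dataframe import parse_df_datetimes

df = pd.DataFrame({'dt': ['2024-01-01 00:00:00', '0001-01-01 00:00:00']})
parsed = parse_df_datetimes(df)
print(parsed['dt'])  # expect parsed datetimes; the year-1 value goes through dateutil
                     # instead of tripping pandas' OutOfBoundsDatetime
```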
```diff
@@ -665,7 +675,7 @@ def get_uuid_cols(df: 'pd.DataFrame') -> List[str]:
 
     Returns
     -------
-    A list of columns to treat as
+    A list of columns to treat as UUIDs.
     """
     if df is None:
         return []
@@ -692,6 +702,135 @@ def get_uuid_cols(df: 'pd.DataFrame') -> List[str]:
     ]
 
 
+def get_datetime_cols(
+    df: 'pd.DataFrame',
+    timezone_aware: bool = True,
+    timezone_naive: bool = True,
+) -> List[str]:
+    """
+    Get the columns which contain `datetime` or `Timestamp` objects from a Pandas DataFrame.
+
+    Parameters
+    ----------
+    df: pd.DataFrame
+        The DataFrame which may contain `datetime` or `Timestamp` objects.
+
+    timezone_aware: bool, default True
+        If `True`, include timezone-aware datetime columns.
+
+    timezone_naive: bool, default True
+        If `True`, include timezone-naive datetime columns.
+
+    Returns
+    -------
+    A list of columns to treat as datetimes.
+    """
+    if not timezone_aware and not timezone_naive:
+        raise ValueError("`timezone_aware` and `timezone_naive` cannot both be `False`.")
+
+    if df is None:
+        return []
+
+    from datetime import datetime
+    from meerschaum.utils.dtypes import are_dtypes_equal
+    is_dask = 'dask' in df.__module__
+    if is_dask:
+        df = get_first_valid_dask_partition(df)
+
+    known_dt_cols = [
+        col
+        for col, typ in df.dtypes.items()
+        if are_dtypes_equal('datetime', str(typ))
+    ]
+
+    if len(df) == 0:
+        return known_dt_cols
+
+    cols_indices = {
+        col: df[col].first_valid_index()
+        for col in df.columns
+        if col not in known_dt_cols
+    }
+    pydt_cols = [
+        col
+        for col, ix in cols_indices.items()
+        if (
+            ix is not None
+            and
+            isinstance(df.loc[ix][col], datetime)
+        )
+    ]
+    dt_cols_set = set(known_dt_cols + pydt_cols)
+    all_dt_cols = [
+        col
+        for col in df.columns
+        if col in dt_cols_set
+    ]
+    if timezone_aware and timezone_naive:
+        return all_dt_cols
+
+    known_timezone_aware_dt_cols = [
+        col
+        for col in known_dt_cols
+        if getattr(df[col], 'tz', None) is not None
+    ]
+    timezone_aware_pydt_cols = [
+        col
+        for col in pydt_cols
+        if df.loc[cols_indices[col]][col].tzinfo is not None
+    ]
+    timezone_aware_dt_cols_set = set(known_timezone_aware_dt_cols + timezone_aware_pydt_cols)
+    if timezone_aware:
+        return [
+            col
+            for col in all_dt_cols
+            if col in timezone_aware_pydt_cols
+        ]
+
+    return [
+        col
+        for col in all_dt_cols
+        if col not in timezone_aware_dt_cols_set
+    ]
+
+
+def get_bytes_cols(df: 'pd.DataFrame') -> List[str]:
+    """
+    Get the columns which contain bytes strings from a Pandas DataFrame.
+
+    Parameters
+    ----------
+    df: pd.DataFrame
+        The DataFrame which may contain bytes strings.
+
+    Returns
+    -------
+    A list of columns to treat as bytes.
+    """
+    if df is None:
+        return []
+    is_dask = 'dask' in df.__module__
+    if is_dask:
+        df = get_first_valid_dask_partition(df)
+
+    if len(df) == 0:
+        return []
+
+    cols_indices = {
+        col: df[col].first_valid_index()
+        for col in df.columns
+    }
+    return [
+        col
+        for col, ix in cols_indices.items()
+        if (
+            ix is not None
+            and
+            isinstance(df.loc[ix][col], bytes)
+        )
+    ]
+
+
 def enforce_dtypes(
     df: 'pd.DataFrame',
     dtypes: Dict[str, str],
```
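Both new helpers inspect only the first valid value per column (and only the first non-empty partition for Dask frames), so they are cheap heuristics rather than full scans. A usage sketch with illustrative object-dtype columns:

```python
from datetime import datetime, timezone
import pandas as pd
from meerschaum.utils.dataframe import get_datetime_cols, get_bytes_cols

df = pd.DataFrame({
    'ts_aware': pd.Series([datetime(2024, 1, 1, tzinfo=timezone.utc)], dtype=object),
    'ts_naive': pd.Series([datetime(2024, 1, 1)], dtype=object),
    'blob': pd.Series([b'\xde\xad\xbe\xef'], dtype=object),
})
print(get_datetime_cols(df))                        # expect ['ts_aware', 'ts_naive']
print(get_datetime_cols(df, timezone_naive=False))  # expect ['ts_aware']
print(get_datetime_cols(df, timezone_aware=False))  # expect ['ts_naive']
print(get_bytes_cols(df))                           # expect ['blob']
```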
```diff
@@ -743,6 +882,7 @@ def enforce_dtypes(
         is_dtype_numeric,
         attempt_cast_to_numeric,
         attempt_cast_to_uuid,
+        attempt_cast_to_bytes,
         coerce_timezone as _coerce_timezone,
     )
     pandas = mrsm.attempt_import('pandas')
@@ -773,6 +913,11 @@ def enforce_dtypes(
         for col, typ in dtypes.items()
         if typ == 'uuid'
     ]
+    bytes_cols = [
+        col
+        for col, typ in dtypes.items()
+        if typ == 'bytes'
+    ]
     datetime_cols = [
         col
         for col, typ in dtypes.items()
@@ -826,6 +971,17 @@ def enforce_dtypes(
                     if debug:
                         dprint(f"Unable to parse column '{col}' as UUID:\n{e}")
 
+    if bytes_cols:
+        if debug:
+            dprint(f"Checking for bytes: {bytes_cols}")
+        for col in bytes_cols:
+            if col in df.columns:
+                try:
+                    df[col] = df[col].apply(attempt_cast_to_bytes)
+                except Exception as e:
+                    if debug:
+                        dprint(f"Unable to parse column '{col}' as bytes:\n{e}")
+
     if datetime_cols and coerce_timezone:
         if debug:
             dprint(f"Checking for datetime conversion: {datetime_cols}")
@@ -931,6 +1087,8 @@ def get_datetime_bound_from_df(
     -------
     The minimum or maximum datetime value in the dataframe, or `None`.
     """
+    from meerschaum.utils.dtypes import to_datetime, value_is_null
+
     if df is None:
         return None
     if not datetime_column:
@@ -982,9 +1140,9 @@
         dt_val = dt_val.compute()
 
     return (
-
+        to_datetime(dt_val, as_pydatetime=True)
         if are_dtypes_equal(str(type(dt_val)), 'datetime')
-        else (dt_val if
+        else (dt_val if not value_is_null(dt_val) else None)
     )
 
     return None
@@ -1127,7 +1285,7 @@ def get_first_valid_dask_partition(ddf: 'dask.dataframe.DataFrame') -> Union['pd
     for partition in ddf.partitions:
         try:
             pdf = partition.compute()
-        except Exception
+        except Exception:
             continue
         if len(pdf) > 0:
             return pdf
@@ -1408,12 +1566,16 @@ def to_json(
     A JSON string.
     """
     from meerschaum.utils.packages import import_pandas
+    from meerschaum.utils.dtypes import serialize_bytes
     pd = import_pandas()
     uuid_cols = get_uuid_cols(df)
-    if safe_copy and bool(uuid_cols):
+    bytes_cols = get_bytes_cols(df)
+    if safe_copy and bool(uuid_cols or bytes_cols):
         df = df.copy()
     for col in uuid_cols:
         df[col] = df[col].astype(str)
+    for col in bytes_cols:
+        df[col] = df[col].apply(serialize_bytes)
     return df.infer_objects(copy=False).fillna(pd.NA).to_json(
         date_format=date_format,
         date_unit=date_unit,
```
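`to_json()` now base64-encodes bytes columns via `serialize_bytes` before serialization, so binary payloads round-trip through the API as plain ASCII. A sketch:

```python
import pandas as pd
from meerschaum.utils.dataframe import to_json

df = pd.DataFrame({'blob': [b'hello']})
print(to_json(df))  # b'hello' is emitted as its base64 form 'aGVsbG8='
```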
meerschaum/utils/dtypes/__init__.py
CHANGED
```diff
@@ -15,7 +15,19 @@ import meerschaum as mrsm
 from meerschaum.utils.typing import Dict, Union, Any
 from meerschaum.utils.warnings import warn
 
-MRSM_PD_DTYPES: Dict[str, str] = {
+MRSM_ALIAS_DTYPES: Dict[str, str] = {
+    'decimal': 'numeric',
+    'number': 'numeric',
+    'jsonl': 'json',
+    'JSON': 'json',
+    'binary': 'bytes',
+    'blob': 'bytes',
+    'varbinary': 'bytes',
+    'bytea': 'bytes',
+    'guid': 'uuid',
+    'UUID': 'uuid',
+}
+MRSM_PD_DTYPES: Dict[Union[str, None], str] = {
     'json': 'object',
     'numeric': 'object',
     'uuid': 'object',
@@ -27,6 +39,8 @@ MRSM_PD_DTYPES: Dict[str, str] = {
     'int32': 'Int32',
     'int64': 'Int64',
     'str': 'string[python]',
+    'bytes': 'object',
+    None: 'object',
 }
 
 
@@ -38,6 +52,10 @@ def to_pandas_dtype(dtype: str) -> str:
     if known_dtype is not None:
         return known_dtype
 
+    alias_dtype = MRSM_ALIAS_DTYPES.get(dtype, None)
+    if alias_dtype is not None:
+        return MRSM_PD_DTYPES[alias_dtype]
+
    ### NOTE: Kind of a hack, but if the first word of the given dtype is in all caps,
    ### treat it as a SQL db type.
    if dtype.split(' ')[0].isupper():
```
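`to_pandas_dtype()` now resolves the new aliases before falling back to the all-caps SQL-type heuristic. A sketch of the lookup chain:

```python
from meerschaum.utils.dtypes import to_pandas_dtype

# Aliases normalize to canonical Meerschaum dtypes, then to Pandas dtypes.
print(to_pandas_dtype('decimal'))  # 'object' (alias of 'numeric')
print(to_pandas_dtype('bytea'))    # 'object' (alias of 'bytes')
print(to_pandas_dtype('guid'))     # 'object' (alias of 'uuid')
```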
```diff
@@ -95,7 +113,7 @@ def are_dtypes_equal(
     try:
         if ldtype == rdtype:
             return True
-    except Exception
+    except Exception:
         warn(f"Exception when comparing dtypes, returning False:\n{traceback.format_exc()}")
         return False
 
@@ -115,6 +133,10 @@ def are_dtypes_equal(
     if ldtype in uuid_dtypes and rdtype in uuid_dtypes:
         return True
 
+    bytes_dtypes = ('bytes', 'object')
+    if ldtype in bytes_dtypes and rdtype in bytes_dtypes:
+        return True
+
     ldtype_clean = ldtype.split('[', maxsplit=1)[0]
     rdtype_clean = rdtype.split('[', maxsplit=1)[0]
 
@@ -185,7 +207,7 @@ def attempt_cast_to_numeric(value: Any) -> Any:
             if not value_is_null(value)
             else Decimal('NaN')
         )
-    except Exception
+    except Exception:
         return value
 
 
@@ -201,7 +223,23 @@ def attempt_cast_to_uuid(value: Any) -> Any:
             if not value_is_null(value)
             else None
         )
-    except Exception
+    except Exception:
+        return value
+
+
+def attempt_cast_to_bytes(value: Any) -> Any:
+    """
+    Given a value, attempt to coerce it into a bytestring.
+    """
+    if isinstance(value, bytes):
+        return value
+    try:
+        return (
+            deserialize_bytes_string(str(value))
+            if not value_is_null(value)
+            else None
+        )
+    except Exception:
         return value
 
 
@@ -251,7 +289,7 @@ def coerce_timezone(
 ) -> Any:
     """
     Given a `datetime`, pandas `Timestamp` or `Series` of `Timestamp`,
-    return a
+    return a UTC timestamp (strip timezone if `strip_utc` is `True`.
     """
     if dt is None:
         return None
@@ -266,9 +304,7 @@ def coerce_timezone(
     dt_is_series = hasattr(dt, 'dtype') and hasattr(dt, '__module__')
 
     if dt_is_series:
-        is_dask = 'dask' in dt.__module__
         pandas = mrsm.attempt_import('pandas', lazy=False)
-        dd = mrsm.attempt_import('dask.dataframe') if is_dask else None
 
         if (
             pandas.api.types.is_datetime64_any_dtype(dt) and (
@@ -279,14 +315,13 @@ def coerce_timezone(
         ):
             return dt
 
-        dt_series = (
-            pandas.to_datetime(dt, utc=True, format='ISO8601')
-            if dd is None
-            else dd.to_datetime(dt, utc=True, format='ISO8601')
-        )
+        dt_series = to_datetime(dt, coerce_utc=False)
         if strip_utc:
-
-
+            try:
+                if dt_series.dt.tz is not None:
+                    dt_series = dt_series.dt.tz_localize(None)
+            except Exception:
+                pass
 
         return dt_series
 
```
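`attempt_cast_to_bytes()` passes real bytes through untouched and otherwise defers to `deserialize_bytes_string()` (defined below), so both base64 strings and PostgreSQL-style hex strings are accepted:

```python
from meerschaum.utils.dtypes import attempt_cast_to_bytes

print(attempt_cast_to_bytes(b'raw'))           # b'raw' (already bytes)
print(attempt_cast_to_bytes('aGVsbG8='))       # b'hello' (base64)
print(attempt_cast_to_bytes('\\x68656c6c6f'))  # b'hello' (hex with the literal '\x' prefix)
```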
```diff
@@ -299,3 +334,103 @@
     if strip_utc:
         return utc_dt.replace(tzinfo=None)
     return utc_dt
+
+
+def to_datetime(dt_val: Any, as_pydatetime: bool = False, coerce_utc: bool = True) -> Any:
+    """
+    Wrap `pd.to_datetime()` and add support for out-of-bounds values.
+    """
+    pandas, dateutil_parser = mrsm.attempt_import('pandas', 'dateutil.parser', lazy=False)
+    is_dask = 'dask' in getattr(dt_val, '__module__', '')
+    dd = mrsm.attempt_import('dask.dataframe') if is_dask else None
+    dt_is_series = hasattr(dt_val, 'dtype') and hasattr(dt_val, '__module__')
+    pd = pandas if dd is None else dd
+
+    try:
+        new_dt_val = pd.to_datetime(dt_val, utc=True, format='ISO8601')
+        if as_pydatetime:
+            return new_dt_val.to_pydatetime()
+        return new_dt_val
+    except (pd.errors.OutOfBoundsDatetime, ValueError):
+        pass
+
+    def parse(x: Any) -> Any:
+        try:
+            return dateutil_parser.parse(x)
+        except Exception:
+            return x
+
+    if dt_is_series:
+        new_series = dt_val.apply(parse)
+        if coerce_utc:
+            return coerce_timezone(new_series)
+        return new_series
+
+    new_dt_val = parse(dt_val)
+    if not coerce_utc:
+        return new_dt_val
+    return coerce_timezone(new_dt_val)
+
+
+def serialize_bytes(data: bytes) -> str:
+    """
+    Return the given bytes as a base64-encoded string.
+    """
+    import base64
+    if not isinstance(data, bytes) and value_is_null(data):
+        return data
+    return base64.b64encode(data).decode('utf-8')
+
+
+def deserialize_bytes_string(data: str | None, force_hex: bool = False) -> bytes | None:
+    """
+    Given a serialized ASCII string of bytes data, return the original bytes.
+    The input data may either be base64- or hex-encoded.
+
+    Parameters
+    ----------
+    data: str | None
+        The string to be deserialized into bytes.
+        May be base64- or hex-encoded (prefixed with `'\\x'`).
+
+    force_hex: bool = False
+        If `True`, treat the input string as hex-encoded.
+        If `data` does not begin with the prefix `'\\x'`, set `force_hex` to `True`.
+        This will still strip the leading `'\\x'` prefix if present.
+
+    Returns
+    -------
+    The original bytes used to produce the encoded string `data`.
+    """
+    if not isinstance(data, str) and value_is_null(data):
+        return data
+
+    import binascii
+    import base64
+
+    is_hex = force_hex or data.startswith('\\x')
+
+    if is_hex:
+        if data.startswith('\\x'):
+            data = data[2:]
+        return binascii.unhexlify(data)
+
+    return base64.b64decode(data)
+
+
+def deserialize_base64(data: str) -> bytes:
+    """
+    Return the original bytestring from the given base64-encoded string.
+    """
+    import base64
+    return base64.b64decode(data)
+
+
+def encode_bytes_for_bytea(data: bytes, with_prefix: bool = True) -> str | None:
+    """
+    Return the given bytes as a hex string for PostgreSQL's `BYTEA` type.
+    """
+    import binascii
+    if not isinstance(data, bytes) and value_is_null(data):
+        return data
+    return ('\\x' if with_prefix else '') + binascii.hexlify(data).decode('utf-8')
```
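The serializers are inverses of one another; `encode_bytes_for_bytea()` produces the hex form PostgreSQL expects for `BYTEA` literals. A round-trip sketch:

```python
from meerschaum.utils.dtypes import (
    serialize_bytes,
    deserialize_bytes_string,
    encode_bytes_for_bytea,
)

data = b'\xde\xad\xbe\xef'
b64 = serialize_bytes(data)           # '3q2+7w=='
assert deserialize_bytes_string(b64) == data

hexed = encode_bytes_for_bytea(data)  # '\xdeadbeef' (a literal backslash-x prefix)
assert deserialize_bytes_string(hexed) == data
```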
meerschaum/utils/dtypes/sql.py
CHANGED
```diff
@@ -13,9 +13,8 @@ NUMERIC_PRECISION_FLAVORS: Dict[str, Tuple[int, int]] = {
     'mariadb': (38, 20),
     'mysql': (38, 20),
     'mssql': (28, 10),
-    'duckdb': (15, 3),
-    'sqlite': (15, 4),
 }
+NUMERIC_AS_TEXT_FLAVORS = {'sqlite', 'duckdb'}
 TIMEZONE_NAIVE_FLAVORS = {'oracle', 'mysql', 'mariadb'}
 
 ### MySQL doesn't allow for casting as BIGINT, so this is a workaround.
@@ -102,6 +101,10 @@ DB_TO_PD_DTYPES: Dict[str, Union[str, Dict[str, str]]] = {
     'JSONB': 'json',
     'UUID': 'uuid',
     'UNIQUEIDENTIFIER': 'uuid',
+    'BYTEA': 'bytes',
+    'BLOB': 'bytes',
+    'VARBINARY': 'bytes',
+    'VARBINARY(MAX)': 'bytes',
     'substrings': {
         'CHAR': 'string[pyarrow]',
         'TIMESTAMP': 'datetime64[ns]',
@@ -114,6 +117,9 @@ DB_TO_PD_DTYPES: Dict[str, Union[str, Dict[str, str]]] = {
         'INT': 'int64[pyarrow]',
         'BOOL': 'bool[pyarrow]',
         'JSON': 'json',
+        'BYTE': 'bytes',
+        'LOB': 'bytes',
+        'BINARY': 'bytes',
     },
     'default': 'object',
 }
```
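The new `DB_TO_PD_DTYPES` entries map binary database types back to the `bytes` Meerschaum dtype, and `NUMERIC_AS_TEXT_FLAVORS` replaces the dropped precision entries for SQLite and DuckDB:

```python
from meerschaum.utils.dtypes.sql import DB_TO_PD_DTYPES, NUMERIC_AS_TEXT_FLAVORS

print(DB_TO_PD_DTYPES['BYTEA'])             # 'bytes'
print(DB_TO_PD_DTYPES['VARBINARY(MAX)'])    # 'bytes'
print('sqlite' in NUMERIC_AS_TEXT_FLAVORS)  # True (numerics are stored as TEXT)
```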
```diff
@@ -256,8 +262,8 @@ PD_TO_DB_DTYPES_FLAVORS: Dict[str, Dict[str, str]] = {
         'mysql': f'DECIMAL{NUMERIC_PRECISION_FLAVORS["mysql"]}',
         'mssql': f'NUMERIC{NUMERIC_PRECISION_FLAVORS["mssql"]}',
         'oracle': 'NUMBER',
-        'sqlite':
-        'duckdb': '
+        'sqlite': 'TEXT',
+        'duckdb': 'TEXT',
         'citus': 'NUMERIC',
         'cockroachdb': 'NUMERIC',
         'default': 'NUMERIC',
@@ -276,6 +282,19 @@ PD_TO_DB_DTYPES_FLAVORS: Dict[str, Dict[str, str]] = {
         'cockroachdb': 'UUID',
         'default': 'TEXT',
     },
+    'bytes': {
+        'timescaledb': 'BYTEA',
+        'postgresql': 'BYTEA',
+        'mariadb': 'BLOB',
+        'mysql': 'BLOB',
+        'mssql': 'VARBINARY(MAX)',
+        'oracle': 'BLOB',
+        'sqlite': 'BLOB',
+        'duckdb': 'BLOB',
+        'citus': 'BYTEA',
+        'cockroachdb': 'BYTEA',
+        'default': 'BLOB',
+    },
 }
 PD_TO_SQLALCHEMY_DTYPES_FLAVORS: Dict[str, Dict[str, str]] = {
     'int': {
@@ -402,7 +421,7 @@ PD_TO_SQLALCHEMY_DTYPES_FLAVORS: Dict[str, Dict[str, str]] = {
         'mysql': 'Numeric',
         'mssql': 'Numeric',
         'oracle': 'Numeric',
-        'sqlite': '
+        'sqlite': 'UnicodeText',
         'duckdb': 'Numeric',
         'citus': 'Numeric',
         'cockroachdb': 'Numeric',
@@ -421,6 +440,19 @@ PD_TO_SQLALCHEMY_DTYPES_FLAVORS: Dict[str, Dict[str, str]] = {
         'cockroachdb': 'Uuid',
         'default': 'Uuid',
     },
+    'bytes': {
+        'timescaledb': 'LargeBinary',
+        'postgresql': 'LargeBinary',
+        'mariadb': 'LargeBinary',
+        'mysql': 'LargeBinary',
+        'mssql': 'LargeBinary',
+        'oracle': 'LargeBinary',
+        'sqlite': 'LargeBinary',
+        'duckdb': 'LargeBinary',
+        'citus': 'LargeBinary',
+        'cockroachdb': 'LargeBinary',
+        'default': 'LargeBinary',
+    },
 }
 
 AUTO_INCREMENT_COLUMN_FLAVORS: Dict[str, str] = {
```
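Per-flavor lookups for the new `bytes` dtype (every flavor maps to SQLAlchemy's `LargeBinary`):

```python
from meerschaum.utils.dtypes.sql import (
    PD_TO_DB_DTYPES_FLAVORS,
    PD_TO_SQLALCHEMY_DTYPES_FLAVORS,
)

print(PD_TO_DB_DTYPES_FLAVORS['bytes']['postgresql'])       # 'BYTEA'
print(PD_TO_DB_DTYPES_FLAVORS['bytes']['mssql'])            # 'VARBINARY(MAX)'
print(PD_TO_SQLALCHEMY_DTYPES_FLAVORS['bytes']['default'])  # 'LargeBinary'
```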
```diff
@@ -502,7 +534,7 @@ def get_db_type_from_pd_type(
     """
     from meerschaum.utils.warnings import warn
     from meerschaum.utils.packages import attempt_import
-    from meerschaum.utils.dtypes import are_dtypes_equal
+    from meerschaum.utils.dtypes import are_dtypes_equal, MRSM_ALIAS_DTYPES
     from meerschaum.utils.misc import parse_arguments_str
     sqlalchemy_types = attempt_import('sqlalchemy.types')
 
@@ -512,6 +544,9 @@
         else PD_TO_SQLALCHEMY_DTYPES_FLAVORS
     )
 
+    if pd_type in MRSM_ALIAS_DTYPES:
+        pd_type = MRSM_ALIAS_DTYPES[pd_type]
+
     ### Check whether we are able to match this type (e.g. pyarrow support).
     found_db_type = False
     if pd_type not in types_registry:
@@ -568,7 +603,6 @@
         return cls(*cls_args, **cls_kwargs)
 
     if 'numeric' in db_type.lower():
-        numeric_type_str = PD_TO_DB_DTYPES_FLAVORS['numeric'].get(flavor, 'NUMERIC')
         if flavor not in NUMERIC_PRECISION_FLAVORS:
             return sqlalchemy_types.Numeric
         precision, scale = NUMERIC_PRECISION_FLAVORS[flavor]
```