meerschaum 2.6.17__py3-none-any.whl → 2.7.0__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (34) hide show
  1. meerschaum/actions/delete.py +65 -69
  2. meerschaum/actions/install.py +1 -2
  3. meerschaum/api/routes/_pipes.py +7 -8
  4. meerschaum/config/_default.py +1 -1
  5. meerschaum/config/_paths.py +2 -1
  6. meerschaum/config/_version.py +1 -1
  7. meerschaum/connectors/api/_pipes.py +18 -21
  8. meerschaum/connectors/sql/_instance.py +11 -12
  9. meerschaum/connectors/sql/_pipes.py +122 -78
  10. meerschaum/connectors/sql/_sql.py +43 -8
  11. meerschaum/connectors/valkey/_pipes.py +12 -1
  12. meerschaum/core/Pipe/__init__.py +23 -13
  13. meerschaum/core/Pipe/_attributes.py +25 -1
  14. meerschaum/core/Pipe/_dtypes.py +23 -16
  15. meerschaum/core/Pipe/_sync.py +59 -31
  16. meerschaum/core/Pipe/_verify.py +8 -7
  17. meerschaum/jobs/_Job.py +2 -0
  18. meerschaum/plugins/_Plugin.py +11 -14
  19. meerschaum/utils/daemon/Daemon.py +20 -13
  20. meerschaum/utils/dataframe.py +178 -16
  21. meerschaum/utils/dtypes/__init__.py +149 -14
  22. meerschaum/utils/dtypes/sql.py +41 -7
  23. meerschaum/utils/misc.py +8 -8
  24. meerschaum/utils/sql.py +174 -64
  25. meerschaum/utils/venv/_Venv.py +4 -4
  26. meerschaum/utils/venv/__init__.py +53 -20
  27. {meerschaum-2.6.17.dist-info → meerschaum-2.7.0.dist-info}/METADATA +1 -1
  28. {meerschaum-2.6.17.dist-info → meerschaum-2.7.0.dist-info}/RECORD +34 -34
  29. {meerschaum-2.6.17.dist-info → meerschaum-2.7.0.dist-info}/LICENSE +0 -0
  30. {meerschaum-2.6.17.dist-info → meerschaum-2.7.0.dist-info}/NOTICE +0 -0
  31. {meerschaum-2.6.17.dist-info → meerschaum-2.7.0.dist-info}/WHEEL +0 -0
  32. {meerschaum-2.6.17.dist-info → meerschaum-2.7.0.dist-info}/entry_points.txt +0 -0
  33. {meerschaum-2.6.17.dist-info → meerschaum-2.7.0.dist-info}/top_level.txt +0 -0
  34. {meerschaum-2.6.17.dist-info → meerschaum-2.7.0.dist-info}/zip-safe +0 -0
@@ -139,7 +139,6 @@ def filter_unseen_df(
139
139
  import functools
140
140
  import traceback
141
141
  from decimal import Decimal
142
- from uuid import UUID
143
142
  from meerschaum.utils.warnings import warn
144
143
  from meerschaum.utils.packages import import_pandas, attempt_import
145
144
  from meerschaum.utils.dtypes import (
@@ -147,6 +146,7 @@ def filter_unseen_df(
147
146
  are_dtypes_equal,
148
147
  attempt_cast_to_numeric,
149
148
  attempt_cast_to_uuid,
149
+ attempt_cast_to_bytes,
150
150
  coerce_timezone,
151
151
  )
152
152
  pd = import_pandas(debug=debug)
@@ -333,6 +333,11 @@ def filter_unseen_df(
333
333
  old_uuid_cols = get_uuid_cols(old_df)
334
334
  new_uuid_cols = get_uuid_cols(new_df)
335
335
  uuid_cols = set(new_uuid_cols + old_uuid_cols)
336
+
337
+ old_bytes_cols = get_bytes_cols(old_df)
338
+ new_bytes_cols = get_bytes_cols(new_df)
339
+ bytes_cols = set(new_bytes_cols + old_bytes_cols)
340
+
336
341
  joined_df = merge(
337
342
  new_df.infer_objects(copy=False).fillna(NA),
338
343
  old_df.infer_objects(copy=False).fillna(NA),
@@ -368,6 +373,14 @@ def filter_unseen_df(
368
373
  except Exception:
369
374
  warn(f"Unable to parse numeric column '{uuid_col}':\n{traceback.format_exc()}")
370
375
 
376
+ for bytes_col in bytes_cols:
377
+ if bytes_col not in delta_df.columns:
378
+ continue
379
+ try:
380
+ delta_df[bytes_col] = delta_df[bytes_col].apply(attempt_cast_to_bytes)
381
+ except Exception:
382
+ warn(f"Unable to parse bytes column '{bytes_col}':\n{traceback.format_exc()}")
383
+
371
384
  return delta_df
372
385
 
373
386
 
@@ -429,6 +442,7 @@ def parse_df_datetimes(
429
442
  from meerschaum.utils.debug import dprint
430
443
  from meerschaum.utils.warnings import warn
431
444
  from meerschaum.utils.misc import items_str
445
+ from meerschaum.utils.dtypes import to_datetime
432
446
  import traceback
433
447
  pd = import_pandas()
434
448
  pandas = attempt_import('pandas')
@@ -480,7 +494,7 @@ def parse_df_datetimes(
480
494
  ### skip parsing if DataFrame is empty
481
495
  if len(pdf) == 0:
482
496
  if debug:
483
- dprint(f"df is empty. Returning original DataFrame without casting datetime columns...")
497
+ dprint("df is empty. Returning original DataFrame without casting datetime columns...")
484
498
  return df
485
499
 
486
500
  ignore_cols = set(
@@ -494,8 +508,8 @@ def parse_df_datetimes(
494
508
 
495
509
  if len(cols_to_inspect) == 0:
496
510
  if debug:
497
- dprint(f"All columns are ignored, skipping datetime detection...")
498
- return df.fillna(pandas.NA)
511
+ dprint("All columns are ignored, skipping datetime detection...")
512
+ return df.infer_objects(copy=False).fillna(pandas.NA)
499
513
 
500
514
  ### apply regex to columns to determine which are ISO datetimes
501
515
  iso_dt_regex = r'\d{4}-\d{2}-\d{2}.\d{2}\:\d{2}\:\d+'
@@ -508,21 +522,17 @@ def parse_df_datetimes(
508
522
  if not datetime_cols:
509
523
  if debug:
510
524
  dprint("No columns detected as datetimes, returning...")
511
- return df.fillna(pandas.NA)
525
+ return df.infer_objects(copy=False).fillna(pandas.NA)
512
526
 
513
527
  if debug:
514
528
  dprint("Converting columns to datetimes: " + str(datetime_cols))
515
529
 
516
530
  try:
517
531
  if not using_dask:
518
- df[datetime_cols] = df[datetime_cols].apply(
519
- pd.to_datetime,
520
- utc=True,
521
- format='ISO8601',
522
- )
532
+ df[datetime_cols] = df[datetime_cols].apply(to_datetime)
523
533
  else:
524
534
  df[datetime_cols] = df[datetime_cols].apply(
525
- pd.to_datetime,
535
+ to_datetime,
526
536
  utc=True,
527
537
  axis=1,
528
538
  meta={
@@ -665,7 +675,7 @@ def get_uuid_cols(df: 'pd.DataFrame') -> List[str]:
665
675
 
666
676
  Returns
667
677
  -------
668
- A list of columns to treat as numerics.
678
+ A list of columns to treat as UUIDs.
669
679
  """
670
680
  if df is None:
671
681
  return []
@@ -692,6 +702,135 @@ def get_uuid_cols(df: 'pd.DataFrame') -> List[str]:
692
702
  ]
693
703
 
694
704
 
705
def get_datetime_cols(
    df: 'pd.DataFrame',
    timezone_aware: bool = True,
    timezone_naive: bool = True,
) -> List[str]:
    """
    Get the columns which contain `datetime` or `Timestamp` objects from a Pandas DataFrame.

    Parameters
    ----------
    df: pd.DataFrame
        The DataFrame which may contain `datetime` or `Timestamp` objects.

    timezone_aware: bool, default True
        If `True`, include timezone-aware datetime columns.

    timezone_naive: bool, default True
        If `True`, include timezone-naive datetime columns.

    Returns
    -------
    A list of columns to treat as datetimes (in the DataFrame's column order).

    Raises
    ------
    ValueError
        If both `timezone_aware` and `timezone_naive` are `False`.
    """
    if not timezone_aware and not timezone_naive:
        raise ValueError("`timezone_aware` and `timezone_naive` cannot both be `False`.")

    if df is None:
        return []

    from datetime import datetime
    from meerschaum.utils.dtypes import are_dtypes_equal
    is_dask = 'dask' in df.__module__
    if is_dask:
        df = get_first_valid_dask_partition(df)

    ### Columns whose dtype is already a datetime dtype (keep the dtype to inspect its timezone).
    known_dt_dtypes = {
        col: typ
        for col, typ in df.dtypes.items()
        if are_dtypes_equal('datetime', str(typ))
    }
    known_dt_cols = list(known_dt_dtypes)

    if len(df) == 0:
        return known_dt_cols

    ### Sample the first non-null value of every remaining column.
    ### NOTE: Reading the value positionally (rather than `df.loc[ix][col]`)
    ### returns a scalar even when the index contains duplicate labels.
    first_valid_values = {}
    for col in df.columns:
        if col in known_dt_dtypes:
            continue
        valid_values = df[col].dropna()
        if len(valid_values) > 0:
            first_valid_values[col] = valid_values.iloc[0]

    pydt_cols = [
        col
        for col, value in first_valid_values.items()
        if isinstance(value, datetime)
    ]
    dt_cols_set = set(known_dt_cols + pydt_cols)
    all_dt_cols = [
        col
        for col in df.columns
        if col in dt_cols_set
    ]
    if timezone_aware and timezone_naive:
        return all_dt_cols

    ### The timezone of a typed datetime column lives on its dtype, not on the Series itself.
    known_timezone_aware_dt_cols = [
        col
        for col, typ in known_dt_dtypes.items()
        if getattr(typ, 'tz', None) is not None
    ]
    timezone_aware_pydt_cols = [
        col
        for col in pydt_cols
        if first_valid_values[col].tzinfo is not None
    ]
    timezone_aware_dt_cols_set = set(known_timezone_aware_dt_cols + timezone_aware_pydt_cols)
    if timezone_aware:
        ### Filter on the combined set so typed timezone-aware columns are included too.
        return [
            col
            for col in all_dt_cols
            if col in timezone_aware_dt_cols_set
        ]

    return [
        col
        for col in all_dt_cols
        if col not in timezone_aware_dt_cols_set
    ]
+
796
+
797
def get_bytes_cols(df: 'pd.DataFrame') -> List[str]:
    """
    Get the columns which contain bytes strings from a Pandas DataFrame.

    Parameters
    ----------
    df: pd.DataFrame
        The DataFrame which may contain bytes strings.

    Returns
    -------
    A list of columns to treat as bytes
    (columns whose first non-null value is a `bytes` object).
    """
    if df is None:
        return []
    is_dask = 'dask' in df.__module__
    if is_dask:
        df = get_first_valid_dask_partition(df)

    if len(df) == 0:
        return []

    bytes_cols = []
    for col in df.columns:
        ### NOTE: Read the first non-null value positionally.
        ### `df.loc[ix][col]` returns a Series (never `bytes`) when index labels repeat.
        valid_values = df[col].dropna()
        if len(valid_values) > 0 and isinstance(valid_values.iloc[0], bytes):
            bytes_cols.append(col)
    return bytes_cols
832
+
833
+
695
834
  def enforce_dtypes(
696
835
  df: 'pd.DataFrame',
697
836
  dtypes: Dict[str, str],
@@ -743,6 +882,7 @@ def enforce_dtypes(
743
882
  is_dtype_numeric,
744
883
  attempt_cast_to_numeric,
745
884
  attempt_cast_to_uuid,
885
+ attempt_cast_to_bytes,
746
886
  coerce_timezone as _coerce_timezone,
747
887
  )
748
888
  pandas = mrsm.attempt_import('pandas')
@@ -773,6 +913,11 @@ def enforce_dtypes(
773
913
  for col, typ in dtypes.items()
774
914
  if typ == 'uuid'
775
915
  ]
916
+ bytes_cols = [
917
+ col
918
+ for col, typ in dtypes.items()
919
+ if typ == 'bytes'
920
+ ]
776
921
  datetime_cols = [
777
922
  col
778
923
  for col, typ in dtypes.items()
@@ -826,6 +971,17 @@ def enforce_dtypes(
826
971
  if debug:
827
972
  dprint(f"Unable to parse column '{col}' as UUID:\n{e}")
828
973
 
974
+ if bytes_cols:
975
+ if debug:
976
+ dprint(f"Checking for bytes: {bytes_cols}")
977
+ for col in bytes_cols:
978
+ if col in df.columns:
979
+ try:
980
+ df[col] = df[col].apply(attempt_cast_to_bytes)
981
+ except Exception as e:
982
+ if debug:
983
+ dprint(f"Unable to parse column '{col}' as bytes:\n{e}")
984
+
829
985
  if datetime_cols and coerce_timezone:
830
986
  if debug:
831
987
  dprint(f"Checking for datetime conversion: {datetime_cols}")
@@ -931,6 +1087,8 @@ def get_datetime_bound_from_df(
931
1087
  -------
932
1088
  The minimum or maximum datetime value in the dataframe, or `None`.
933
1089
  """
1090
+ from meerschaum.utils.dtypes import to_datetime, value_is_null
1091
+
934
1092
  if df is None:
935
1093
  return None
936
1094
  if not datetime_column:
@@ -982,9 +1140,9 @@ def get_datetime_bound_from_df(
982
1140
  dt_val = dt_val.compute()
983
1141
 
984
1142
  return (
985
- pandas.to_datetime(dt_val).to_pydatetime()
1143
+ to_datetime(dt_val, as_pydatetime=True)
986
1144
  if are_dtypes_equal(str(type(dt_val)), 'datetime')
987
- else (dt_val if dt_val is not pandas.NA else None)
1145
+ else (dt_val if not value_is_null(dt_val) else None)
988
1146
  )
989
1147
 
990
1148
  return None
@@ -1127,7 +1285,7 @@ def get_first_valid_dask_partition(ddf: 'dask.dataframe.DataFrame') -> Union['pd
1127
1285
  for partition in ddf.partitions:
1128
1286
  try:
1129
1287
  pdf = partition.compute()
1130
- except Exception as e:
1288
+ except Exception:
1131
1289
  continue
1132
1290
  if len(pdf) > 0:
1133
1291
  return pdf
@@ -1408,12 +1566,16 @@ def to_json(
1408
1566
  A JSON string.
1409
1567
  """
1410
1568
  from meerschaum.utils.packages import import_pandas
1569
+ from meerschaum.utils.dtypes import serialize_bytes
1411
1570
  pd = import_pandas()
1412
1571
  uuid_cols = get_uuid_cols(df)
1413
- if uuid_cols and safe_copy:
1572
+ bytes_cols = get_bytes_cols(df)
1573
+ if safe_copy and bool(uuid_cols or bytes_cols):
1414
1574
  df = df.copy()
1415
1575
  for col in uuid_cols:
1416
1576
  df[col] = df[col].astype(str)
1577
+ for col in bytes_cols:
1578
+ df[col] = df[col].apply(serialize_bytes)
1417
1579
  return df.infer_objects(copy=False).fillna(pd.NA).to_json(
1418
1580
  date_format=date_format,
1419
1581
  date_unit=date_unit,
@@ -15,7 +15,19 @@ import meerschaum as mrsm
15
15
  from meerschaum.utils.typing import Dict, Union, Any
16
16
  from meerschaum.utils.warnings import warn
17
17
 
18
- MRSM_PD_DTYPES: Dict[str, str] = {
18
### Alternate spellings of dtype names, mapped to the dtype name they resolve to.
### Lookups are exact (case-sensitive), hence the separate 'JSON' and 'UUID' entries.
MRSM_ALIAS_DTYPES: Dict[str, str] = {
    'decimal': 'numeric',
    'number': 'numeric',
    'jsonl': 'json',
    'JSON': 'json',
    'binary': 'bytes',
    'blob': 'bytes',
    'varbinary': 'bytes',
    'bytea': 'bytes',
    'guid': 'uuid',
    'UUID': 'uuid',
}
30
+ MRSM_PD_DTYPES: Dict[Union[str, None], str] = {
19
31
  'json': 'object',
20
32
  'numeric': 'object',
21
33
  'uuid': 'object',
@@ -27,6 +39,8 @@ MRSM_PD_DTYPES: Dict[str, str] = {
27
39
  'int32': 'Int32',
28
40
  'int64': 'Int64',
29
41
  'str': 'string[python]',
42
+ 'bytes': 'object',
43
+ None: 'object',
30
44
  }
31
45
 
32
46
 
@@ -38,6 +52,10 @@ def to_pandas_dtype(dtype: str) -> str:
38
52
  if known_dtype is not None:
39
53
  return known_dtype
40
54
 
55
+ alias_dtype = MRSM_ALIAS_DTYPES.get(dtype, None)
56
+ if alias_dtype is not None:
57
+ return MRSM_PD_DTYPES[alias_dtype]
58
+
41
59
  ### NOTE: Kind of a hack, but if the first word of the given dtype is in all caps,
42
60
  ### treat it as a SQL db type.
43
61
  if dtype.split(' ')[0].isupper():
@@ -95,7 +113,7 @@ def are_dtypes_equal(
95
113
  try:
96
114
  if ldtype == rdtype:
97
115
  return True
98
- except Exception as e:
116
+ except Exception:
99
117
  warn(f"Exception when comparing dtypes, returning False:\n{traceback.format_exc()}")
100
118
  return False
101
119
 
@@ -115,6 +133,10 @@ def are_dtypes_equal(
115
133
  if ldtype in uuid_dtypes and rdtype in uuid_dtypes:
116
134
  return True
117
135
 
136
+ bytes_dtypes = ('bytes', 'object')
137
+ if ldtype in bytes_dtypes and rdtype in bytes_dtypes:
138
+ return True
139
+
118
140
  ldtype_clean = ldtype.split('[', maxsplit=1)[0]
119
141
  rdtype_clean = rdtype.split('[', maxsplit=1)[0]
120
142
 
@@ -185,7 +207,7 @@ def attempt_cast_to_numeric(value: Any) -> Any:
185
207
  if not value_is_null(value)
186
208
  else Decimal('NaN')
187
209
  )
188
- except Exception as e:
210
+ except Exception:
189
211
  return value
190
212
 
191
213
 
@@ -201,7 +223,23 @@ def attempt_cast_to_uuid(value: Any) -> Any:
201
223
  if not value_is_null(value)
202
224
  else None
203
225
  )
204
- except Exception as e:
226
+ except Exception:
227
+ return value
228
+
229
+
230
def attempt_cast_to_bytes(value: Any) -> Any:
    """
    Given a value, attempt to coerce it into a bytestring.

    Parameters
    ----------
    value: Any
        The value to coerce. `bytes` are returned unchanged, other binary buffers
        (`bytearray`, `memoryview`) are copied into `bytes`, and anything else is
        stringified and deserialized with `deserialize_bytes_string()`.

    Returns
    -------
    The `bytes` value, `None` for null values,
    or the original `value` if it could not be deserialized.
    """
    if isinstance(value, bytes):
        return value

    ### NOTE: Convert binary buffers directly. `str()` of these yields a repr
    ### (e.g. `"bytearray(b'...')"`), which must not be decoded as base64.
    ### Presumably some database drivers hand back binary columns as `memoryview`.
    if isinstance(value, (bytearray, memoryview)):
        return bytes(value)

    try:
        return (
            deserialize_bytes_string(str(value))
            if not value_is_null(value)
            else None
        )
    except Exception:
        ### Best-effort cast: leave values which cannot be deserialized untouched.
        return value
206
244
 
207
245
 
@@ -251,7 +289,7 @@ def coerce_timezone(
251
289
  ) -> Any:
252
290
  """
253
291
  Given a `datetime`, pandas `Timestamp` or `Series` of `Timestamp`,
254
- return a naive datetime in terms of UTC.
292
+ return a UTC timestamp (strip timezone if `strip_utc` is `True`.
255
293
  """
256
294
  if dt is None:
257
295
  return None
@@ -266,9 +304,7 @@ def coerce_timezone(
266
304
  dt_is_series = hasattr(dt, 'dtype') and hasattr(dt, '__module__')
267
305
 
268
306
  if dt_is_series:
269
- is_dask = 'dask' in dt.__module__
270
307
  pandas = mrsm.attempt_import('pandas', lazy=False)
271
- dd = mrsm.attempt_import('dask.dataframe') if is_dask else None
272
308
 
273
309
  if (
274
310
  pandas.api.types.is_datetime64_any_dtype(dt) and (
@@ -279,14 +315,13 @@ def coerce_timezone(
279
315
  ):
280
316
  return dt
281
317
 
282
- dt_series = (
283
- pandas.to_datetime(dt, utc=True, format='ISO8601')
284
- if dd is None
285
- else dd.to_datetime(dt, utc=True, format='ISO8601')
286
- )
318
+ dt_series = to_datetime(dt, coerce_utc=False)
287
319
  if strip_utc:
288
- if dt_series.dt.tz is not None:
289
- dt_series = dt_series.dt.tz_localize(None)
320
+ try:
321
+ if dt_series.dt.tz is not None:
322
+ dt_series = dt_series.dt.tz_localize(None)
323
+ except Exception:
324
+ pass
290
325
 
291
326
  return dt_series
292
327
 
@@ -299,3 +334,103 @@ def coerce_timezone(
299
334
  if strip_utc:
300
335
  return utc_dt.replace(tzinfo=None)
301
336
  return utc_dt
337
+
338
+
339
def to_datetime(dt_val: Any, as_pydatetime: bool = False, coerce_utc: bool = True) -> Any:
    """
    Wrap `pd.to_datetime()` and add support for out-of-bounds values.

    Parameters
    ----------
    dt_val: Any
        The value (scalar or series) to be parsed as a datetime.

    as_pydatetime: bool, default False
        If `True`, call `.to_pydatetime()` on the value parsed by `to_datetime()`.
        Only applies when the initial parsing succeeds.

    coerce_utc: bool, default True
        If `True`, pass the result of the fallback (`dateutil`) parsing through `coerce_timezone()`.
        The initial parsing attempt always uses `utc=True`.

    Returns
    -------
    The parsed datetime value (or series).
    Values which cannot be parsed by the fallback parser are returned as-is.
    """
    pandas, dateutil_parser = mrsm.attempt_import('pandas', 'dateutil.parser', lazy=False)
    is_dask = 'dask' in getattr(dt_val, '__module__', '')
    dd = mrsm.attempt_import('dask.dataframe') if is_dask else None
    dt_is_series = hasattr(dt_val, 'dtype') and hasattr(dt_val, '__module__')
    pd = pandas if dd is None else dd

    try:
        new_dt_val = pd.to_datetime(dt_val, utc=True, format='ISO8601')
        if as_pydatetime:
            return new_dt_val.to_pydatetime()
        return new_dt_val
    ### NOTE: Always take the exception class from `pandas`:
    ### `pd` may be `dask.dataframe` here, which is not guaranteed to expose `errors`.
    except (pandas.errors.OutOfBoundsDatetime, ValueError):
        pass

    def parse(x: Any) -> Any:
        ### Fall back to `dateutil`, which is not limited to the nanosecond `Timestamp` range.
        try:
            return dateutil_parser.parse(x)
        except Exception:
            return x

    if dt_is_series:
        new_series = dt_val.apply(parse)
        if coerce_utc:
            return coerce_timezone(new_series)
        return new_series

    new_dt_val = parse(dt_val)
    if not coerce_utc:
        return new_dt_val
    return coerce_timezone(new_dt_val)
373
+
374
+
375
def serialize_bytes(data: bytes) -> str:
    """
    Encode the given bytes as base64 and return the result as a UTF-8 string.
    Null (non-bytes) values are returned unchanged.
    """
    from base64 import b64encode

    is_bytes = isinstance(data, bytes)
    if not is_bytes and value_is_null(data):
        return data

    encoded = b64encode(data)
    return encoded.decode('utf-8')
383
+
384
+
385
+ def deserialize_bytes_string(data: str | None, force_hex: bool = False) -> bytes | None:
386
+ """
387
+ Given a serialized ASCII string of bytes data, return the original bytes.
388
+ The input data may either be base64- or hex-encoded.
389
+
390
+ Parameters
391
+ ----------
392
+ data: str | None
393
+ The string to be deserialized into bytes.
394
+ May be base64- or hex-encoded (prefixed with `'\\x'`).
395
+
396
+ force_hex: bool = False
397
+ If `True`, treat the input string as hex-encoded.
398
+ If `data` does not begin with the prefix `'\\x'`, set `force_hex` to `True`.
399
+ This will still strip the leading `'\\x'` prefix if present.
400
+
401
+ Returns
402
+ -------
403
+ The original bytes used to produce the encoded string `data`.
404
+ """
405
+ if not isinstance(data, str) and value_is_null(data):
406
+ return data
407
+
408
+ import binascii
409
+ import base64
410
+
411
+ is_hex = force_hex or data.startswith('\\x')
412
+
413
+ if is_hex:
414
+ if data.startswith('\\x'):
415
+ data = data[2:]
416
+ return binascii.unhexlify(data)
417
+
418
+ return base64.b64decode(data)
419
+
420
+
421
def deserialize_base64(data: str) -> bytes:
    """
    Decode a base64-encoded string into its original bytestring.
    """
    from base64 import b64decode

    decoded = b64decode(data)
    return decoded
427
+
428
+
429
+ def encode_bytes_for_bytea(data: bytes, with_prefix: bool = True) -> str | None:
430
+ """
431
+ Return the given bytes as a hex string for PostgreSQL's `BYTEA` type.
432
+ """
433
+ import binascii
434
+ if not isinstance(data, bytes) and value_is_null(data):
435
+ return data
436
+ return ('\\x' if with_prefix else '') + binascii.hexlify(data).decode('utf-8')
@@ -13,9 +13,8 @@ NUMERIC_PRECISION_FLAVORS: Dict[str, Tuple[int, int]] = {
13
13
  'mariadb': (38, 20),
14
14
  'mysql': (38, 20),
15
15
  'mssql': (28, 10),
16
- 'duckdb': (15, 3),
17
- 'sqlite': (15, 4),
18
16
  }
17
+ NUMERIC_AS_TEXT_FLAVORS = {'sqlite', 'duckdb'}
19
18
  TIMEZONE_NAIVE_FLAVORS = {'oracle', 'mysql', 'mariadb'}
20
19
 
21
20
  ### MySQL doesn't allow for casting as BIGINT, so this is a workaround.
@@ -102,6 +101,10 @@ DB_TO_PD_DTYPES: Dict[str, Union[str, Dict[str, str]]] = {
102
101
  'JSONB': 'json',
103
102
  'UUID': 'uuid',
104
103
  'UNIQUEIDENTIFIER': 'uuid',
104
+ 'BYTEA': 'bytes',
105
+ 'BLOB': 'bytes',
106
+ 'VARBINARY': 'bytes',
107
+ 'VARBINARY(MAX)': 'bytes',
105
108
  'substrings': {
106
109
  'CHAR': 'string[pyarrow]',
107
110
  'TIMESTAMP': 'datetime64[ns]',
@@ -114,6 +117,9 @@ DB_TO_PD_DTYPES: Dict[str, Union[str, Dict[str, str]]] = {
114
117
  'INT': 'int64[pyarrow]',
115
118
  'BOOL': 'bool[pyarrow]',
116
119
  'JSON': 'json',
120
+ 'BYTE': 'bytes',
121
+ 'LOB': 'bytes',
122
+ 'BINARY': 'bytes',
117
123
  },
118
124
  'default': 'object',
119
125
  }
@@ -256,8 +262,8 @@ PD_TO_DB_DTYPES_FLAVORS: Dict[str, Dict[str, str]] = {
256
262
  'mysql': f'DECIMAL{NUMERIC_PRECISION_FLAVORS["mysql"]}',
257
263
  'mssql': f'NUMERIC{NUMERIC_PRECISION_FLAVORS["mssql"]}',
258
264
  'oracle': 'NUMBER',
259
- 'sqlite': f'DECIMAL{NUMERIC_PRECISION_FLAVORS["sqlite"]}',
260
- 'duckdb': 'NUMERIC',
265
+ 'sqlite': 'TEXT',
266
+ 'duckdb': 'TEXT',
261
267
  'citus': 'NUMERIC',
262
268
  'cockroachdb': 'NUMERIC',
263
269
  'default': 'NUMERIC',
@@ -276,6 +282,19 @@ PD_TO_DB_DTYPES_FLAVORS: Dict[str, Dict[str, str]] = {
276
282
  'cockroachdb': 'UUID',
277
283
  'default': 'TEXT',
278
284
  },
285
+ 'bytes': {
286
+ 'timescaledb': 'BYTEA',
287
+ 'postgresql': 'BYTEA',
288
+ 'mariadb': 'BLOB',
289
+ 'mysql': 'BLOB',
290
+ 'mssql': 'VARBINARY(MAX)',
291
+ 'oracle': 'BLOB',
292
+ 'sqlite': 'BLOB',
293
+ 'duckdb': 'BLOB',
294
+ 'citus': 'BYTEA',
295
+ 'cockroachdb': 'BYTEA',
296
+ 'default': 'BLOB',
297
+ },
279
298
  }
280
299
  PD_TO_SQLALCHEMY_DTYPES_FLAVORS: Dict[str, Dict[str, str]] = {
281
300
  'int': {
@@ -402,7 +421,7 @@ PD_TO_SQLALCHEMY_DTYPES_FLAVORS: Dict[str, Dict[str, str]] = {
402
421
  'mysql': 'Numeric',
403
422
  'mssql': 'Numeric',
404
423
  'oracle': 'Numeric',
405
- 'sqlite': 'Numeric',
424
+ 'sqlite': 'UnicodeText',
406
425
  'duckdb': 'Numeric',
407
426
  'citus': 'Numeric',
408
427
  'cockroachdb': 'Numeric',
@@ -421,6 +440,19 @@ PD_TO_SQLALCHEMY_DTYPES_FLAVORS: Dict[str, Dict[str, str]] = {
421
440
  'cockroachdb': 'Uuid',
422
441
  'default': 'Uuid',
423
442
  },
443
+ 'bytes': {
444
+ 'timescaledb': 'LargeBinary',
445
+ 'postgresql': 'LargeBinary',
446
+ 'mariadb': 'LargeBinary',
447
+ 'mysql': 'LargeBinary',
448
+ 'mssql': 'LargeBinary',
449
+ 'oracle': 'LargeBinary',
450
+ 'sqlite': 'LargeBinary',
451
+ 'duckdb': 'LargeBinary',
452
+ 'citus': 'LargeBinary',
453
+ 'cockroachdb': 'LargeBinary',
454
+ 'default': 'LargeBinary',
455
+ },
424
456
  }
425
457
 
426
458
  AUTO_INCREMENT_COLUMN_FLAVORS: Dict[str, str] = {
@@ -502,7 +534,7 @@ def get_db_type_from_pd_type(
502
534
  """
503
535
  from meerschaum.utils.warnings import warn
504
536
  from meerschaum.utils.packages import attempt_import
505
- from meerschaum.utils.dtypes import are_dtypes_equal
537
+ from meerschaum.utils.dtypes import are_dtypes_equal, MRSM_ALIAS_DTYPES
506
538
  from meerschaum.utils.misc import parse_arguments_str
507
539
  sqlalchemy_types = attempt_import('sqlalchemy.types')
508
540
 
@@ -512,6 +544,9 @@ def get_db_type_from_pd_type(
512
544
  else PD_TO_SQLALCHEMY_DTYPES_FLAVORS
513
545
  )
514
546
 
547
+ if pd_type in MRSM_ALIAS_DTYPES:
548
+ pd_type = MRSM_ALIAS_DTYPES[pd_type]
549
+
515
550
  ### Check whether we are able to match this type (e.g. pyarrow support).
516
551
  found_db_type = False
517
552
  if pd_type not in types_registry:
@@ -568,7 +603,6 @@ def get_db_type_from_pd_type(
568
603
  return cls(*cls_args, **cls_kwargs)
569
604
 
570
605
  if 'numeric' in db_type.lower():
571
- numeric_type_str = PD_TO_DB_DTYPES_FLAVORS['numeric'].get(flavor, 'NUMERIC')
572
606
  if flavor not in NUMERIC_PRECISION_FLAVORS:
573
607
  return sqlalchemy_types.Numeric
574
608
  precision, scale = NUMERIC_PRECISION_FLAVORS[flavor]