meerschaum 2.6.16__py3-none-any.whl → 2.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. meerschaum/_internal/arguments/_parse_arguments.py +1 -1
  2. meerschaum/actions/delete.py +65 -69
  3. meerschaum/actions/edit.py +22 -2
  4. meerschaum/actions/install.py +1 -2
  5. meerschaum/actions/sync.py +2 -3
  6. meerschaum/api/routes/_pipes.py +7 -8
  7. meerschaum/config/_default.py +1 -1
  8. meerschaum/config/_paths.py +2 -1
  9. meerschaum/config/_version.py +1 -1
  10. meerschaum/connectors/api/_pipes.py +18 -21
  11. meerschaum/connectors/sql/_create_engine.py +3 -3
  12. meerschaum/connectors/sql/_instance.py +11 -12
  13. meerschaum/connectors/sql/_pipes.py +143 -91
  14. meerschaum/connectors/sql/_sql.py +43 -8
  15. meerschaum/connectors/valkey/_pipes.py +12 -1
  16. meerschaum/core/Pipe/__init__.py +23 -13
  17. meerschaum/core/Pipe/_attributes.py +25 -1
  18. meerschaum/core/Pipe/_dtypes.py +23 -16
  19. meerschaum/core/Pipe/_sync.py +59 -31
  20. meerschaum/core/Pipe/_verify.py +8 -7
  21. meerschaum/jobs/_Job.py +4 -1
  22. meerschaum/plugins/_Plugin.py +11 -14
  23. meerschaum/utils/daemon/Daemon.py +22 -15
  24. meerschaum/utils/dataframe.py +178 -16
  25. meerschaum/utils/dtypes/__init__.py +149 -14
  26. meerschaum/utils/dtypes/sql.py +41 -7
  27. meerschaum/utils/misc.py +8 -8
  28. meerschaum/utils/packages/_packages.py +1 -1
  29. meerschaum/utils/schedule.py +8 -3
  30. meerschaum/utils/sql.py +180 -100
  31. meerschaum/utils/venv/_Venv.py +4 -4
  32. meerschaum/utils/venv/__init__.py +53 -20
  33. {meerschaum-2.6.16.dist-info → meerschaum-2.7.0.dist-info}/METADATA +2 -2
  34. {meerschaum-2.6.16.dist-info → meerschaum-2.7.0.dist-info}/RECORD +40 -40
  35. {meerschaum-2.6.16.dist-info → meerschaum-2.7.0.dist-info}/LICENSE +0 -0
  36. {meerschaum-2.6.16.dist-info → meerschaum-2.7.0.dist-info}/NOTICE +0 -0
  37. {meerschaum-2.6.16.dist-info → meerschaum-2.7.0.dist-info}/WHEEL +0 -0
  38. {meerschaum-2.6.16.dist-info → meerschaum-2.7.0.dist-info}/entry_points.txt +0 -0
  39. {meerschaum-2.6.16.dist-info → meerschaum-2.7.0.dist-info}/top_level.txt +0 -0
  40. {meerschaum-2.6.16.dist-info → meerschaum-2.7.0.dist-info}/zip-safe +0 -0
meerschaum/utils/daemon/Daemon.py
@@ -432,7 +432,7 @@ class Daemon:
             + "allow_dirty_run=True)"
         )
         env = dict(os.environ)
-        env['MRSM_NOASK'] = 'true'
+        env[STATIC_CONFIG['environment']['noninteractive']] = 'true'
         _launch_success_bool = venv_exec(_launch_daemon_code, debug=debug, venv=None, env=env)
         msg = (
             "Success"
@@ -465,18 +465,25 @@ class Daemon:
             self._write_stop_file('kill')
             return True, "Process has already stopped."

+        psutil = attempt_import('psutil')
         process = self.process
         try:
             process.terminate()
             process.kill()
             process.wait(timeout=timeout)
         except Exception as e:
-            return False, f"Failed to kill job {self} with exception: {e}"
+            return False, f"Failed to kill job {self} ({process}) with exception: {e}"
+
+        try:
+            if process.status():
+                return False, f"Failed to stop daemon '{self}' ({process})."
+        except psutil.NoSuchProcess:
+            pass

         if self.pid_path.exists():
             try:
                 self.pid_path.unlink()
-            except Exception as e:
+            except Exception:
                 pass

         self._write_stop_file('kill')
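Note: 2.7.0 adds a verification step after the terminate/kill/wait sequence: `process.status()` is polled once more, and only a `psutil.NoSuchProcess` exception is taken as proof that the daemon is gone. A minimal sketch of that pattern (illustrative only, not meerschaum code):

    import psutil

    def process_is_gone(proc: psutil.Process) -> bool:
        """Return True only once psutil confirms the PID no longer exists."""
        try:
            proc.status()  # raises psutil.NoSuchProcess after the process exits
        except psutil.NoSuchProcess:
            return True
        return False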
@@ -534,7 +541,7 @@ class Daemon:
         if not timeout:
             try:
                 success = self.process.status() == 'stopped'
-            except psutil.NoSuchProcess as e:
+            except psutil.NoSuchProcess:
                 success = True
             msg = "Success" if success else f"Failed to suspend daemon '{self.daemon_id}'."
             if success:
@@ -677,11 +684,11 @@ class Daemon:
         raise SystemExit(0)

     def _send_signal(
-            self,
-            signal_to_send,
-            timeout: Union[float, int, None] = None,
-            check_timeout_interval: Union[float, int, None] = None,
-        ) -> SuccessTuple:
+        self,
+        signal_to_send,
+        timeout: Union[float, int, None] = None,
+        check_timeout_interval: Union[float, int, None] = None,
+    ) -> SuccessTuple:
         """Send a signal to the daemon process.

         Parameters
@@ -709,7 +716,7 @@ class Daemon:
             )

             os.kill(pid, signal_to_send)
-        except Exception as e:
+        except Exception:
             return False, f"Failed to send signal {signal_to_send}:\n{traceback.format_exc()}"

         timeout = self.get_timeout_seconds(timeout)
@@ -727,7 +734,7 @@ class Daemon:
             time.sleep(check_timeout_interval)

         return False, (
-            f"Failed to stop daemon '{self.daemon_id}' within {timeout} second"
+            f"Failed to stop daemon '{self.daemon_id}' (PID: {pid}) within {timeout} second"
             + ('s' if timeout != 1 else '') + '.'
         )

@@ -745,7 +752,7 @@ class Daemon:
         if _already_exists and not allow_dirty_run:
             error(
                 f"Daemon '{self.daemon_id}' already exists. " +
-                f"To allow this daemon to run, do one of the following:\n"
+                "To allow this daemon to run, do one of the following:\n"
                 + "  - Execute `daemon.cleanup()`.\n"
                 + f"  - Delete the directory '{self.path}'.\n"
                 + "  - Pass `allow_dirty_run=True` to `daemon.run()`.\n",
@@ -764,7 +771,7 @@ class Daemon:
         if '_process' not in self.__dict__ or self.__dict__['_process'].pid != int(pid):
             try:
                 self._process = psutil.Process(int(pid))
-            except Exception as e:
+            except Exception:
                 if self.pid_path.exists():
                     self.pid_path.unlink()
                 return None
@@ -788,7 +795,7 @@ class Daemon:
         if self.pid_path.exists():
             try:
                 self.pid_path.unlink()
-            except Exception as e:
+            except Exception:
                 pass
             return 'stopped'

@@ -1000,7 +1007,7 @@ class Daemon:
         try:
             with open(self.properties_path, 'r', encoding='utf-8') as file:
                 properties = json.load(file)
-        except Exception as e:
+        except Exception:
             properties = {}

         return properties
meerschaum/utils/dataframe.py
@@ -139,7 +139,6 @@ def filter_unseen_df(
     import functools
     import traceback
     from decimal import Decimal
-    from uuid import UUID
     from meerschaum.utils.warnings import warn
     from meerschaum.utils.packages import import_pandas, attempt_import
     from meerschaum.utils.dtypes import (
@@ -147,6 +146,7 @@ def filter_unseen_df(
         are_dtypes_equal,
         attempt_cast_to_numeric,
         attempt_cast_to_uuid,
+        attempt_cast_to_bytes,
         coerce_timezone,
     )
     pd = import_pandas(debug=debug)
@@ -333,6 +333,11 @@ def filter_unseen_df(
     old_uuid_cols = get_uuid_cols(old_df)
     new_uuid_cols = get_uuid_cols(new_df)
     uuid_cols = set(new_uuid_cols + old_uuid_cols)
+
+    old_bytes_cols = get_bytes_cols(old_df)
+    new_bytes_cols = get_bytes_cols(new_df)
+    bytes_cols = set(new_bytes_cols + old_bytes_cols)
+
     joined_df = merge(
         new_df.infer_objects(copy=False).fillna(NA),
         old_df.infer_objects(copy=False).fillna(NA),
@@ -368,6 +373,14 @@ def filter_unseen_df(
         except Exception:
             warn(f"Unable to parse UUID column '{uuid_col}':\n{traceback.format_exc()}")

+    for bytes_col in bytes_cols:
+        if bytes_col not in delta_df.columns:
+            continue
+        try:
+            delta_df[bytes_col] = delta_df[bytes_col].apply(attempt_cast_to_bytes)
+        except Exception:
+            warn(f"Unable to parse bytes column '{bytes_col}':\n{traceback.format_exc()}")
+
     return delta_df

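Note: `filter_unseen_df()` now tracks bytes columns from both frames and re-casts them after the merge, so the returned delta frame preserves `bytes` values instead of their string representations. A hypothetical example of the resulting behavior (assuming meerschaum 2.7.0+):

    import pandas as pd
    from meerschaum.utils.dataframe import filter_unseen_df

    old_df = pd.DataFrame({'id': [1], 'payload': [b'\x00\x01']})
    new_df = pd.DataFrame({'id': [1, 2], 'payload': [b'\x00\x01', b'\x00\x02']})

    delta_df = filter_unseen_df(old_df, new_df)
    print(delta_df)  # expected: only the unseen row, with 'payload' restored to bytes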
@@ -429,6 +442,7 @@ def parse_df_datetimes(
     from meerschaum.utils.debug import dprint
     from meerschaum.utils.warnings import warn
     from meerschaum.utils.misc import items_str
+    from meerschaum.utils.dtypes import to_datetime
     import traceback
     pd = import_pandas()
     pandas = attempt_import('pandas')
@@ -480,7 +494,7 @@ def parse_df_datetimes(
     ### skip parsing if DataFrame is empty
     if len(pdf) == 0:
         if debug:
-            dprint(f"df is empty. Returning original DataFrame without casting datetime columns...")
+            dprint("df is empty. Returning original DataFrame without casting datetime columns...")
         return df

     ignore_cols = set(
@@ -494,8 +508,8 @@ def parse_df_datetimes(

     if len(cols_to_inspect) == 0:
         if debug:
-            dprint(f"All columns are ignored, skipping datetime detection...")
-        return df.fillna(pandas.NA)
+            dprint("All columns are ignored, skipping datetime detection...")
+        return df.infer_objects(copy=False).fillna(pandas.NA)

     ### apply regex to columns to determine which are ISO datetimes
     iso_dt_regex = r'\d{4}-\d{2}-\d{2}.\d{2}\:\d{2}\:\d+'
@@ -508,21 +522,17 @@ def parse_df_datetimes(
     if not datetime_cols:
         if debug:
             dprint("No columns detected as datetimes, returning...")
-        return df.fillna(pandas.NA)
+        return df.infer_objects(copy=False).fillna(pandas.NA)

     if debug:
         dprint("Converting columns to datetimes: " + str(datetime_cols))

     try:
         if not using_dask:
-            df[datetime_cols] = df[datetime_cols].apply(
-                pd.to_datetime,
-                utc=True,
-                format='ISO8601',
-            )
+            df[datetime_cols] = df[datetime_cols].apply(to_datetime)
         else:
             df[datetime_cols] = df[datetime_cols].apply(
-                pd.to_datetime,
+                to_datetime,
                 utc=True,
                 axis=1,
                 meta={
@@ -665,7 +675,7 @@ def get_uuid_cols(df: 'pd.DataFrame') -> List[str]:

     Returns
     -------
-    A list of columns to treat as numerics.
+    A list of columns to treat as UUIDs.
     """
     if df is None:
         return []
@@ -692,6 +702,135 @@ def get_uuid_cols(df: 'pd.DataFrame') -> List[str]:
     ]


+def get_datetime_cols(
+    df: 'pd.DataFrame',
+    timezone_aware: bool = True,
+    timezone_naive: bool = True,
+) -> List[str]:
+    """
+    Get the columns which contain `datetime` or `Timestamp` objects from a Pandas DataFrame.
+
+    Parameters
+    ----------
+    df: pd.DataFrame
+        The DataFrame which may contain `datetime` or `Timestamp` objects.
+
+    timezone_aware: bool, default True
+        If `True`, include timezone-aware datetime columns.
+
+    timezone_naive: bool, default True
+        If `True`, include timezone-naive datetime columns.
+
+    Returns
+    -------
+    A list of columns to treat as datetimes.
+    """
+    if not timezone_aware and not timezone_naive:
+        raise ValueError("`timezone_aware` and `timezone_naive` cannot both be `False`.")
+
+    if df is None:
+        return []
+
+    from datetime import datetime
+    from meerschaum.utils.dtypes import are_dtypes_equal
+    is_dask = 'dask' in df.__module__
+    if is_dask:
+        df = get_first_valid_dask_partition(df)
+
+    known_dt_cols = [
+        col
+        for col, typ in df.dtypes.items()
+        if are_dtypes_equal('datetime', str(typ))
+    ]
+
+    if len(df) == 0:
+        return known_dt_cols
+
+    cols_indices = {
+        col: df[col].first_valid_index()
+        for col in df.columns
+        if col not in known_dt_cols
+    }
+    pydt_cols = [
+        col
+        for col, ix in cols_indices.items()
+        if (
+            ix is not None
+            and
+            isinstance(df.loc[ix][col], datetime)
+        )
+    ]
+    dt_cols_set = set(known_dt_cols + pydt_cols)
+    all_dt_cols = [
+        col
+        for col in df.columns
+        if col in dt_cols_set
+    ]
+    if timezone_aware and timezone_naive:
+        return all_dt_cols
+
+    known_timezone_aware_dt_cols = [
+        col
+        for col in known_dt_cols
+        if getattr(df[col], 'tz', None) is not None
+    ]
+    timezone_aware_pydt_cols = [
+        col
+        for col in pydt_cols
+        if df.loc[cols_indices[col]][col].tzinfo is not None
+    ]
+    timezone_aware_dt_cols_set = set(known_timezone_aware_dt_cols + timezone_aware_pydt_cols)
+    if timezone_aware:
+        return [
+            col
+            for col in all_dt_cols
+            if col in timezone_aware_dt_cols_set
+        ]
+
+    return [
+        col
+        for col in all_dt_cols
+        if col not in timezone_aware_dt_cols_set
+    ]
+
+
+def get_bytes_cols(df: 'pd.DataFrame') -> List[str]:
+    """
+    Get the columns which contain bytes strings from a Pandas DataFrame.
+
+    Parameters
+    ----------
+    df: pd.DataFrame
+        The DataFrame which may contain bytes strings.
+
+    Returns
+    -------
+    A list of columns to treat as bytes.
+    """
+    if df is None:
+        return []
+    is_dask = 'dask' in df.__module__
+    if is_dask:
+        df = get_first_valid_dask_partition(df)
+
+    if len(df) == 0:
+        return []
+
+    cols_indices = {
+        col: df[col].first_valid_index()
+        for col in df.columns
+    }
+    return [
+        col
+        for col, ix in cols_indices.items()
+        if (
+            ix is not None
+            and
+            isinstance(df.loc[ix][col], bytes)
+        )
+    ]
+
+
 def enforce_dtypes(
     df: 'pd.DataFrame',
     dtypes: Dict[str, str],
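Note: both new helpers detect columns by inspecting each column's first valid value, since `bytes` values and Python `datetime` objects otherwise hide inside plain `object` columns. A quick illustration with hypothetical data:

    from datetime import datetime, timezone
    import pandas as pd
    from meerschaum.utils.dataframe import get_bytes_cols, get_datetime_cols

    df = pd.DataFrame({
        'ts': [datetime(2024, 1, 1, tzinfo=timezone.utc)],
        'blob': [b'\x00\x01'],
        'note': ['plain string'],
    })
    print(get_datetime_cols(df))  # expected: ['ts']
    print(get_bytes_cols(df))     # expected: ['blob']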
@@ -743,6 +882,7 @@ def enforce_dtypes(
         is_dtype_numeric,
         attempt_cast_to_numeric,
         attempt_cast_to_uuid,
+        attempt_cast_to_bytes,
         coerce_timezone as _coerce_timezone,
     )
     pandas = mrsm.attempt_import('pandas')
@@ -773,6 +913,11 @@ def enforce_dtypes(
         for col, typ in dtypes.items()
         if typ == 'uuid'
     ]
+    bytes_cols = [
+        col
+        for col, typ in dtypes.items()
+        if typ == 'bytes'
+    ]
     datetime_cols = [
         col
         for col, typ in dtypes.items()
@@ -826,6 +971,17 @@ def enforce_dtypes(
                     if debug:
                         dprint(f"Unable to parse column '{col}' as UUID:\n{e}")

+    if bytes_cols:
+        if debug:
+            dprint(f"Checking for bytes: {bytes_cols}")
+        for col in bytes_cols:
+            if col in df.columns:
+                try:
+                    df[col] = df[col].apply(attempt_cast_to_bytes)
+                except Exception as e:
+                    if debug:
+                        dprint(f"Unable to parse column '{col}' as bytes:\n{e}")
+
     if datetime_cols and coerce_timezone:
         if debug:
             dprint(f"Checking for datetime conversion: {datetime_cols}")
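Note: with the new branch above, a pipe may declare `'bytes'` in its `dtypes`, and serialized string values are coerced back to `bytes` on enforcement. A hypothetical example (assuming 2.7.0+):

    import pandas as pd
    from meerschaum.utils.dataframe import enforce_dtypes

    df = pd.DataFrame({'payload': ['AAE=']})  # base64 for b'\x00\x01'
    df = enforce_dtypes(df, {'payload': 'bytes'})
    print(df['payload'][0])  # expected: b'\x00\x01'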
@@ -931,6 +1087,8 @@ def get_datetime_bound_from_df(
     -------
     The minimum or maximum datetime value in the dataframe, or `None`.
     """
+    from meerschaum.utils.dtypes import to_datetime, value_is_null
+
     if df is None:
         return None
     if not datetime_column:
@@ -982,9 +1140,9 @@ def get_datetime_bound_from_df(
             dt_val = dt_val.compute()

         return (
-            pandas.to_datetime(dt_val).to_pydatetime()
+            to_datetime(dt_val, as_pydatetime=True)
             if are_dtypes_equal(str(type(dt_val)), 'datetime')
-            else (dt_val if dt_val is not pandas.NA else None)
+            else (dt_val if not value_is_null(dt_val) else None)
         )

     return None
@@ -1127,7 +1285,7 @@ def get_first_valid_dask_partition(ddf: 'dask.dataframe.DataFrame') -> Union['pd.DataFrame', None]:
     for partition in ddf.partitions:
         try:
             pdf = partition.compute()
-        except Exception as e:
+        except Exception:
             continue
         if len(pdf) > 0:
             return pdf
@@ -1408,12 +1566,16 @@ def to_json(
     A JSON string.
     """
     from meerschaum.utils.packages import import_pandas
+    from meerschaum.utils.dtypes import serialize_bytes
     pd = import_pandas()
     uuid_cols = get_uuid_cols(df)
-    if uuid_cols and safe_copy:
+    bytes_cols = get_bytes_cols(df)
+    if safe_copy and bool(uuid_cols or bytes_cols):
         df = df.copy()
     for col in uuid_cols:
         df[col] = df[col].astype(str)
+    for col in bytes_cols:
+        df[col] = df[col].apply(serialize_bytes)
     return df.infer_objects(copy=False).fillna(pd.NA).to_json(
         date_format=date_format,
         date_unit=date_unit,
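Note: `to_json()` now mirrors the UUID handling for bytes columns, base64-encoding them via the new `serialize_bytes()` before serialization. A hypothetical example:

    import pandas as pd
    from meerschaum.utils.dataframe import to_json

    df = pd.DataFrame({'payload': [b'\x00\x01']})
    print(to_json(df))  # expected: [{"payload":"AAE="}]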
meerschaum/utils/dtypes/__init__.py
@@ -15,7 +15,19 @@ import meerschaum as mrsm
 from meerschaum.utils.typing import Dict, Union, Any
 from meerschaum.utils.warnings import warn

-MRSM_PD_DTYPES: Dict[str, str] = {
+MRSM_ALIAS_DTYPES: Dict[str, str] = {
+    'decimal': 'numeric',
+    'number': 'numeric',
+    'jsonl': 'json',
+    'JSON': 'json',
+    'binary': 'bytes',
+    'blob': 'bytes',
+    'varbinary': 'bytes',
+    'bytea': 'bytes',
+    'guid': 'uuid',
+    'UUID': 'uuid',
+}
+MRSM_PD_DTYPES: Dict[Union[str, None], str] = {
     'json': 'object',
     'numeric': 'object',
     'uuid': 'object',
@@ -27,6 +39,8 @@ MRSM_PD_DTYPES: Dict[str, str] = {
     'int32': 'Int32',
     'int64': 'Int64',
     'str': 'string[python]',
+    'bytes': 'object',
+    None: 'object',
 }

@@ -38,6 +52,10 @@ def to_pandas_dtype(dtype: str) -> str:
     if known_dtype is not None:
         return known_dtype

+    alias_dtype = MRSM_ALIAS_DTYPES.get(dtype, None)
+    if alias_dtype is not None:
+        return MRSM_PD_DTYPES[alias_dtype]
+
     ### NOTE: Kind of a hack, but if the first word of the given dtype is in all caps,
     ### treat it as a SQL db type.
     if dtype.split(' ')[0].isupper():
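Note: with the alias table consulted after the exact-match lookup, common synonyms now resolve to canonical Meerschaum dtypes before falling through to the SQL-type heuristic. For example (assuming 2.7.0+):

    from meerschaum.utils.dtypes import to_pandas_dtype

    print(to_pandas_dtype('bytes'))    # 'object'
    print(to_pandas_dtype('blob'))     # 'object' (alias for 'bytes')
    print(to_pandas_dtype('decimal'))  # 'object' (alias for 'numeric')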
@@ -95,7 +113,7 @@ def are_dtypes_equal(
     try:
         if ldtype == rdtype:
             return True
-    except Exception as e:
+    except Exception:
         warn(f"Exception when comparing dtypes, returning False:\n{traceback.format_exc()}")
         return False

@@ -115,6 +133,10 @@ def are_dtypes_equal(
     if ldtype in uuid_dtypes and rdtype in uuid_dtypes:
         return True

+    bytes_dtypes = ('bytes', 'object')
+    if ldtype in bytes_dtypes and rdtype in bytes_dtypes:
+        return True
+
     ldtype_clean = ldtype.split('[', maxsplit=1)[0]
     rdtype_clean = rdtype.split('[', maxsplit=1)[0]

@@ -185,7 +207,7 @@ def attempt_cast_to_numeric(value: Any) -> Any:
             if not value_is_null(value)
             else Decimal('NaN')
         )
-    except Exception as e:
+    except Exception:
         return value

@@ -201,7 +223,23 @@ def attempt_cast_to_uuid(value: Any) -> Any:
             if not value_is_null(value)
             else None
         )
-    except Exception as e:
+    except Exception:
+        return value
+
+
+def attempt_cast_to_bytes(value: Any) -> Any:
+    """
+    Given a value, attempt to coerce it into a bytestring.
+    """
+    if isinstance(value, bytes):
+        return value
+    try:
+        return (
+            deserialize_bytes_string(str(value))
+            if not value_is_null(value)
+            else None
+        )
+    except Exception:
         return value

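Note: `attempt_cast_to_bytes()` passes `bytes` through untouched, preserves nulls, and otherwise tries to decode via `deserialize_bytes_string()` (defined below). For example:

    from meerschaum.utils.dtypes import attempt_cast_to_bytes

    print(attempt_cast_to_bytes(b'\x00\x01'))  # b'\x00\x01' (already bytes)
    print(attempt_cast_to_bytes('AAE='))       # expected: b'\x00\x01' (base64-decoded)
    print(attempt_cast_to_bytes(None))         # None (nulls are preserved)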
@@ -251,7 +289,7 @@ def coerce_timezone(
 ) -> Any:
     """
     Given a `datetime`, pandas `Timestamp` or `Series` of `Timestamp`,
-    return a naive datetime in terms of UTC.
+    return a UTC timestamp (stripping the timezone if `strip_utc` is `True`).
     """
     if dt is None:
         return None
@@ -266,9 +304,7 @@ def coerce_timezone(
     dt_is_series = hasattr(dt, 'dtype') and hasattr(dt, '__module__')

     if dt_is_series:
-        is_dask = 'dask' in dt.__module__
         pandas = mrsm.attempt_import('pandas', lazy=False)
-        dd = mrsm.attempt_import('dask.dataframe') if is_dask else None

         if (
             pandas.api.types.is_datetime64_any_dtype(dt) and (
@@ -279,14 +315,13 @@ def coerce_timezone(
         ):
             return dt

-        dt_series = (
-            pandas.to_datetime(dt, utc=True, format='ISO8601')
-            if dd is None
-            else dd.to_datetime(dt, utc=True, format='ISO8601')
-        )
+        dt_series = to_datetime(dt, coerce_utc=False)
         if strip_utc:
-            if dt_series.dt.tz is not None:
-                dt_series = dt_series.dt.tz_localize(None)
+            try:
+                if dt_series.dt.tz is not None:
+                    dt_series = dt_series.dt.tz_localize(None)
+            except Exception:
+                pass

         return dt_series

@@ -299,3 +334,103 @@ def coerce_timezone(
     if strip_utc:
         return utc_dt.replace(tzinfo=None)
     return utc_dt
+
+
+def to_datetime(dt_val: Any, as_pydatetime: bool = False, coerce_utc: bool = True) -> Any:
+    """
+    Wrap `pd.to_datetime()` and add support for out-of-bounds values.
+    """
+    pandas, dateutil_parser = mrsm.attempt_import('pandas', 'dateutil.parser', lazy=False)
+    is_dask = 'dask' in getattr(dt_val, '__module__', '')
+    dd = mrsm.attempt_import('dask.dataframe') if is_dask else None
+    dt_is_series = hasattr(dt_val, 'dtype') and hasattr(dt_val, '__module__')
+    pd = pandas if dd is None else dd
+
+    try:
+        new_dt_val = pd.to_datetime(dt_val, utc=True, format='ISO8601')
+        if as_pydatetime:
+            return new_dt_val.to_pydatetime()
+        return new_dt_val
+    except (pd.errors.OutOfBoundsDatetime, ValueError):
+        pass
+
+    def parse(x: Any) -> Any:
+        try:
+            return dateutil_parser.parse(x)
+        except Exception:
+            return x
+
+    if dt_is_series:
+        new_series = dt_val.apply(parse)
+        if coerce_utc:
+            return coerce_timezone(new_series)
+        return new_series
+
+    new_dt_val = parse(dt_val)
+    if not coerce_utc:
+        return new_dt_val
+    return coerce_timezone(new_dt_val)
+
+
+def serialize_bytes(data: bytes) -> str:
+    """
+    Return the given bytes as a base64-encoded string.
+    """
+    import base64
+    if not isinstance(data, bytes) and value_is_null(data):
+        return data
+    return base64.b64encode(data).decode('utf-8')
+
+
+def deserialize_bytes_string(data: str | None, force_hex: bool = False) -> bytes | None:
+    """
+    Given a serialized ASCII string of bytes data, return the original bytes.
+    The input data may be either base64- or hex-encoded.
+
+    Parameters
+    ----------
+    data: str | None
+        The string to be deserialized into bytes.
+        May be base64- or hex-encoded (prefixed with `'\\x'`).
+
+    force_hex: bool = False
+        If `True`, treat the input string as hex-encoded.
+        If `data` does not begin with the prefix `'\\x'`, set `force_hex` to `True`.
+        This will still strip the leading `'\\x'` prefix if present.
+
+    Returns
+    -------
+    The original bytes used to produce the encoded string `data`.
+    """
+    if not isinstance(data, str) and value_is_null(data):
+        return data
+
+    import binascii
+    import base64
+
+    is_hex = force_hex or data.startswith('\\x')
+
+    if is_hex:
+        if data.startswith('\\x'):
+            data = data[2:]
+        return binascii.unhexlify(data)
+
+    return base64.b64decode(data)
+
+
+def deserialize_base64(data: str) -> bytes:
+    """
+    Return the original bytestring from the given base64-encoded string.
+    """
+    import base64
+    return base64.b64decode(data)
+
+
+def encode_bytes_for_bytea(data: bytes, with_prefix: bool = True) -> str | None:
+    """
+    Return the given bytes as a hex string for PostgreSQL's `BYTEA` type.
+    """
+    import binascii
+    if not isinstance(data, bytes) and value_is_null(data):
+        return data
+    return ('\\x' if with_prefix else '') + binascii.hexlify(data).decode('utf-8')
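Note: the new helpers round-trip between `bytes` and their string encodings: base64 for JSON payloads (`serialize_bytes()`) and `'\x'`-prefixed hex for PostgreSQL's `BYTEA` (`encode_bytes_for_bytea()`), with `deserialize_bytes_string()` accepting either. A round-trip sketch with hypothetical values:

    from meerschaum.utils.dtypes import (
        serialize_bytes,
        deserialize_bytes_string,
        encode_bytes_for_bytea,
        to_datetime,
    )

    data = b'\x00\x01\x02'
    b64 = serialize_bytes(data)                     # 'AAEC'
    assert deserialize_bytes_string(b64) == data    # base64 branch

    hexed = encode_bytes_for_bytea(data)            # '\\x000102'
    assert deserialize_bytes_string(hexed) == data  # hex branch (detects the '\\x' prefix)

    # to_datetime() falls back to dateutil for values outside pandas' ns-epoch bounds:
    print(to_datetime('1200-01-01 00:00:00'))       # expected: 1200-01-01 00:00:00+00:00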