meerschaum 2.6.16__py3-none-any.whl → 2.7.0__py3-none-any.whl

Files changed (40)
  1. meerschaum/_internal/arguments/_parse_arguments.py +1 -1
  2. meerschaum/actions/delete.py +65 -69
  3. meerschaum/actions/edit.py +22 -2
  4. meerschaum/actions/install.py +1 -2
  5. meerschaum/actions/sync.py +2 -3
  6. meerschaum/api/routes/_pipes.py +7 -8
  7. meerschaum/config/_default.py +1 -1
  8. meerschaum/config/_paths.py +2 -1
  9. meerschaum/config/_version.py +1 -1
  10. meerschaum/connectors/api/_pipes.py +18 -21
  11. meerschaum/connectors/sql/_create_engine.py +3 -3
  12. meerschaum/connectors/sql/_instance.py +11 -12
  13. meerschaum/connectors/sql/_pipes.py +143 -91
  14. meerschaum/connectors/sql/_sql.py +43 -8
  15. meerschaum/connectors/valkey/_pipes.py +12 -1
  16. meerschaum/core/Pipe/__init__.py +23 -13
  17. meerschaum/core/Pipe/_attributes.py +25 -1
  18. meerschaum/core/Pipe/_dtypes.py +23 -16
  19. meerschaum/core/Pipe/_sync.py +59 -31
  20. meerschaum/core/Pipe/_verify.py +8 -7
  21. meerschaum/jobs/_Job.py +4 -1
  22. meerschaum/plugins/_Plugin.py +11 -14
  23. meerschaum/utils/daemon/Daemon.py +22 -15
  24. meerschaum/utils/dataframe.py +178 -16
  25. meerschaum/utils/dtypes/__init__.py +149 -14
  26. meerschaum/utils/dtypes/sql.py +41 -7
  27. meerschaum/utils/misc.py +8 -8
  28. meerschaum/utils/packages/_packages.py +1 -1
  29. meerschaum/utils/schedule.py +8 -3
  30. meerschaum/utils/sql.py +180 -100
  31. meerschaum/utils/venv/_Venv.py +4 -4
  32. meerschaum/utils/venv/__init__.py +53 -20
  33. {meerschaum-2.6.16.dist-info → meerschaum-2.7.0.dist-info}/METADATA +2 -2
  34. {meerschaum-2.6.16.dist-info → meerschaum-2.7.0.dist-info}/RECORD +40 -40
  35. {meerschaum-2.6.16.dist-info → meerschaum-2.7.0.dist-info}/LICENSE +0 -0
  36. {meerschaum-2.6.16.dist-info → meerschaum-2.7.0.dist-info}/NOTICE +0 -0
  37. {meerschaum-2.6.16.dist-info → meerschaum-2.7.0.dist-info}/WHEEL +0 -0
  38. {meerschaum-2.6.16.dist-info → meerschaum-2.7.0.dist-info}/entry_points.txt +0 -0
  39. {meerschaum-2.6.16.dist-info → meerschaum-2.7.0.dist-info}/top_level.txt +0 -0
  40. {meerschaum-2.6.16.dist-info → meerschaum-2.7.0.dist-info}/zip-safe +0 -0
meerschaum/utils/daemon/Daemon.py

@@ -432,7 +432,7 @@ class Daemon:
             + "allow_dirty_run=True)"
         )
         env = dict(os.environ)
-        env['MRSM_NOASK'] = 'true'
+        env[STATIC_CONFIG['environment']['noninteractive']] = 'true'
         _launch_success_bool = venv_exec(_launch_daemon_code, debug=debug, venv=None, env=env)
         msg = (
             "Success"
@@ -465,18 +465,25 @@ class Daemon:
             self._write_stop_file('kill')
             return True, "Process has already stopped."

+        psutil = attempt_import('psutil')
         process = self.process
         try:
             process.terminate()
             process.kill()
             process.wait(timeout=timeout)
         except Exception as e:
-            return False, f"Failed to kill job {self} with exception: {e}"
+            return False, f"Failed to kill job {self} ({process}) with exception: {e}"
+
+        try:
+            if process.status():
+                return False, "Failed to stop daemon '{self}' ({process})."
+        except psutil.NoSuchProcess:
+            pass

         if self.pid_path.exists():
             try:
                 self.pid_path.unlink()
-            except Exception as e:
+            except Exception:
                 pass

         self._write_stop_file('kill')
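
The added block double-checks that the process is actually gone after `terminate()`/`kill()`: `psutil.Process.status()` raises `psutil.NoSuchProcess` once the PID no longer exists, which the code treats as a successful stop. The same pattern in isolation (a standalone sketch, not code from this diff):

```python
import psutil

def confirm_killed(pid: int, timeout: float = 3.0) -> bool:
    """Kill `pid` and verify it is gone; `NoSuchProcess` means success."""
    try:
        process = psutil.Process(pid)
    except psutil.NoSuchProcess:
        return True  # already gone
    process.terminate()
    try:
        process.wait(timeout=timeout)
    except psutil.TimeoutExpired:
        process.kill()
    try:
        process.status()  # raises NoSuchProcess once the process is reaped
        return False      # a status came back: something still exists
    except psutil.NoSuchProcess:
        return True
```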
@@ -534,7 +541,7 @@ class Daemon:
         if not timeout:
             try:
                 success = self.process.status() == 'stopped'
-            except psutil.NoSuchProcess as e:
+            except psutil.NoSuchProcess:
                 success = True
             msg = "Success" if success else f"Failed to suspend daemon '{self.daemon_id}'."
             if success:
@@ -677,11 +684,11 @@ class Daemon:
         raise SystemExit(0)

     def _send_signal(
-            self,
-            signal_to_send,
-            timeout: Union[float, int, None] = None,
-            check_timeout_interval: Union[float, int, None] = None,
-        ) -> SuccessTuple:
+        self,
+        signal_to_send,
+        timeout: Union[float, int, None] = None,
+        check_timeout_interval: Union[float, int, None] = None,
+    ) -> SuccessTuple:
         """Send a signal to the daemon process.

         Parameters
@@ -709,7 +716,7 @@ class Daemon:
             )

             os.kill(pid, signal_to_send)
-        except Exception as e:
+        except Exception:
            return False, f"Failed to send signal {signal_to_send}:\n{traceback.format_exc()}"

        timeout = self.get_timeout_seconds(timeout)
@@ -727,7 +734,7 @@ class Daemon:
             time.sleep(check_timeout_interval)

         return False, (
-            f"Failed to stop daemon '{self.daemon_id}' within {timeout} second"
+            f"Failed to stop daemon '{self.daemon_id}' (PID: {pid}) within {timeout} second"
             + ('s' if timeout != 1 else '') + '.'
         )

@@ -745,7 +752,7 @@ class Daemon:
         if _already_exists and not allow_dirty_run:
             error(
                 f"Daemon '{self.daemon_id}' already exists. " +
-                f"To allow this daemon to run, do one of the following:\n"
+                "To allow this daemon to run, do one of the following:\n"
                 + " - Execute `daemon.cleanup()`.\n"
                 + f" - Delete the directory '{self.path}'.\n"
                 + " - Pass `allow_dirty_run=True` to `daemon.run()`.\n",
@@ -764,7 +771,7 @@ class Daemon:
         if '_process' not in self.__dict__ or self.__dict__['_process'].pid != int(pid):
             try:
                 self._process = psutil.Process(int(pid))
-            except Exception as e:
+            except Exception:
                 if self.pid_path.exists():
                     self.pid_path.unlink()
                 return None
@@ -788,7 +795,7 @@ class Daemon:
             if self.pid_path.exists():
                 try:
                     self.pid_path.unlink()
-                except Exception as e:
+                except Exception:
                     pass
             return 'stopped'

@@ -1000,7 +1007,7 @@ class Daemon:
         try:
             with open(self.properties_path, 'r', encoding='utf-8') as file:
                 properties = json.load(file)
-        except Exception as e:
+        except Exception:
            properties = {}

        return properties
meerschaum/utils/dataframe.py

@@ -139,7 +139,6 @@ def filter_unseen_df(
     import functools
     import traceback
     from decimal import Decimal
-    from uuid import UUID
     from meerschaum.utils.warnings import warn
     from meerschaum.utils.packages import import_pandas, attempt_import
     from meerschaum.utils.dtypes import (
@@ -147,6 +146,7 @@ def filter_unseen_df(
         are_dtypes_equal,
         attempt_cast_to_numeric,
         attempt_cast_to_uuid,
+        attempt_cast_to_bytes,
         coerce_timezone,
     )
     pd = import_pandas(debug=debug)
@@ -333,6 +333,11 @@ def filter_unseen_df(
     old_uuid_cols = get_uuid_cols(old_df)
     new_uuid_cols = get_uuid_cols(new_df)
     uuid_cols = set(new_uuid_cols + old_uuid_cols)
+
+    old_bytes_cols = get_bytes_cols(old_df)
+    new_bytes_cols = get_bytes_cols(new_df)
+    bytes_cols = set(new_bytes_cols + old_bytes_cols)
+
     joined_df = merge(
         new_df.infer_objects(copy=False).fillna(NA),
         old_df.infer_objects(copy=False).fillna(NA),
@@ -368,6 +373,14 @@ def filter_unseen_df(
        except Exception:
            warn(f"Unable to parse numeric column '{uuid_col}':\n{traceback.format_exc()}")

+    for bytes_col in bytes_cols:
+        if bytes_col not in delta_df.columns:
+            continue
+        try:
+            delta_df[bytes_col] = delta_df[bytes_col].apply(attempt_cast_to_bytes)
+        except Exception:
+            warn(f"Unable to parse bytes column '{bytes_col}':\n{traceback.format_exc()}")
+
     return delta_df


@@ -429,6 +442,7 @@ def parse_df_datetimes(
     from meerschaum.utils.debug import dprint
     from meerschaum.utils.warnings import warn
     from meerschaum.utils.misc import items_str
+    from meerschaum.utils.dtypes import to_datetime
     import traceback
     pd = import_pandas()
     pandas = attempt_import('pandas')
@@ -480,7 +494,7 @@ def parse_df_datetimes(
     ### skip parsing if DataFrame is empty
     if len(pdf) == 0:
         if debug:
-            dprint(f"df is empty. Returning original DataFrame without casting datetime columns...")
+            dprint("df is empty. Returning original DataFrame without casting datetime columns...")
         return df

     ignore_cols = set(
@@ -494,8 +508,8 @@ def parse_df_datetimes(

     if len(cols_to_inspect) == 0:
         if debug:
-            dprint(f"All columns are ignored, skipping datetime detection...")
-        return df.fillna(pandas.NA)
+            dprint("All columns are ignored, skipping datetime detection...")
+        return df.infer_objects(copy=False).fillna(pandas.NA)

     ### apply regex to columns to determine which are ISO datetimes
     iso_dt_regex = r'\d{4}-\d{2}-\d{2}.\d{2}\:\d{2}\:\d+'
@@ -508,21 +522,17 @@ def parse_df_datetimes(
     if not datetime_cols:
         if debug:
             dprint("No columns detected as datetimes, returning...")
-        return df.fillna(pandas.NA)
+        return df.infer_objects(copy=False).fillna(pandas.NA)

     if debug:
         dprint("Converting columns to datetimes: " + str(datetime_cols))

     try:
         if not using_dask:
-            df[datetime_cols] = df[datetime_cols].apply(
-                pd.to_datetime,
-                utc=True,
-                format='ISO8601',
-            )
+            df[datetime_cols] = df[datetime_cols].apply(to_datetime)
         else:
             df[datetime_cols] = df[datetime_cols].apply(
-                pd.to_datetime,
+                to_datetime,
                 utc=True,
                 axis=1,
                 meta={
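
The motivation for routing through the new `to_datetime` wrapper (defined in `meerschaum/utils/dtypes/__init__.py` below) is that `pd.to_datetime()` raises for values outside the nanosecond `Timestamp` range (roughly years 1677-2262). A rough illustration of the failure mode, with assumed values:

```python
import pandas as pd

pd.to_datetime('2024-01-01 00:00:00', utc=True, format='ISO8601')  # fine
try:
    pd.to_datetime('9999-12-31 00:00:00', utc=True, format='ISO8601')
except pd.errors.OutOfBoundsDatetime as e:
    print(f"out of bounds: {e}")  # the new wrapper falls back to dateutil here
```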
@@ -665,7 +675,7 @@ def get_uuid_cols(df: 'pd.DataFrame') -> List[str]:

     Returns
     -------
-    A list of columns to treat as numerics.
+    A list of columns to treat as UUIDs.
     """
     if df is None:
         return []
@@ -692,6 +702,135 @@ def get_uuid_cols(df: 'pd.DataFrame') -> List[str]:
     ]


+def get_datetime_cols(
+    df: 'pd.DataFrame',
+    timezone_aware: bool = True,
+    timezone_naive: bool = True,
+) -> List[str]:
+    """
+    Get the columns which contain `datetime` or `Timestamp` objects from a Pandas DataFrame.
+
+    Parameters
+    ----------
+    df: pd.DataFrame
+        The DataFrame which may contain `datetime` or `Timestamp` objects.
+
+    timezone_aware: bool, default True
+        If `True`, include timezone-aware datetime columns.
+
+    timezone_naive: bool, default True
+        If `True`, include timezone-naive datetime columns.
+
+    Returns
+    -------
+    A list of columns to treat as datetimes.
+    """
+    if not timezone_aware and not timezone_naive:
+        raise ValueError("`timezone_aware` and `timezone_naive` cannot both be `False`.")
+
+    if df is None:
+        return []
+
+    from datetime import datetime
+    from meerschaum.utils.dtypes import are_dtypes_equal
+    is_dask = 'dask' in df.__module__
+    if is_dask:
+        df = get_first_valid_dask_partition(df)
+
+    known_dt_cols = [
+        col
+        for col, typ in df.dtypes.items()
+        if are_dtypes_equal('datetime', str(typ))
+    ]
+
+    if len(df) == 0:
+        return known_dt_cols
+
+    cols_indices = {
+        col: df[col].first_valid_index()
+        for col in df.columns
+        if col not in known_dt_cols
+    }
+    pydt_cols = [
+        col
+        for col, ix in cols_indices.items()
+        if (
+            ix is not None
+            and
+            isinstance(df.loc[ix][col], datetime)
+        )
+    ]
+    dt_cols_set = set(known_dt_cols + pydt_cols)
+    all_dt_cols = [
+        col
+        for col in df.columns
+        if col in dt_cols_set
+    ]
+    if timezone_aware and timezone_naive:
+        return all_dt_cols
+
+    known_timezone_aware_dt_cols = [
+        col
+        for col in known_dt_cols
+        if getattr(df[col], 'tz', None) is not None
+    ]
+    timezone_aware_pydt_cols = [
+        col
+        for col in pydt_cols
+        if df.loc[cols_indices[col]][col].tzinfo is not None
+    ]
+    timezone_aware_dt_cols_set = set(known_timezone_aware_dt_cols + timezone_aware_pydt_cols)
+    if timezone_aware:
+        return [
+            col
+            for col in all_dt_cols
+            if col in timezone_aware_pydt_cols
+        ]
+
+    return [
+        col
+        for col in all_dt_cols
+        if col not in timezone_aware_dt_cols_set
+    ]
+
+
+def get_bytes_cols(df: 'pd.DataFrame') -> List[str]:
+    """
+    Get the columns which contain bytes strings from a Pandas DataFrame.
+
+    Parameters
+    ----------
+    df: pd.DataFrame
+        The DataFrame which may contain bytes strings.
+
+    Returns
+    -------
+    A list of columns to treat as bytes.
+    """
+    if df is None:
+        return []
+    is_dask = 'dask' in df.__module__
+    if is_dask:
+        df = get_first_valid_dask_partition(df)
+
+    if len(df) == 0:
+        return []
+
+    cols_indices = {
+        col: df[col].first_valid_index()
+        for col in df.columns
+    }
+    return [
+        col
+        for col, ix in cols_indices.items()
+        if (
+            ix is not None
+            and
+            isinstance(df.loc[ix][col], bytes)
+        )
+    ]
+
+
 def enforce_dtypes(
     df: 'pd.DataFrame',
     dtypes: Dict[str, str],
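
Both new helpers follow the `get_uuid_cols()` approach of sampling each column's first valid value instead of scanning the whole frame. A toy usage sketch (imports assumed from this file; expected outputs in comments):

```python
import pandas as pd
from datetime import datetime, timezone
from meerschaum.utils.dataframe import get_bytes_cols, get_datetime_cols

df = pd.DataFrame({
    'payload': [b'\x00\x01', b'\x02'],
    'fetched': [datetime(2024, 1, 1, tzinfo=timezone.utc)] * 2,
    'label': ['a', 'b'],
})
print(get_bytes_cols(df))     # expected: ['payload']
print(get_datetime_cols(df))  # expected: ['fetched']
```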
@@ -743,6 +882,7 @@ def enforce_dtypes(
         is_dtype_numeric,
         attempt_cast_to_numeric,
         attempt_cast_to_uuid,
+        attempt_cast_to_bytes,
         coerce_timezone as _coerce_timezone,
     )
     pandas = mrsm.attempt_import('pandas')
@@ -773,6 +913,11 @@ def enforce_dtypes(
         for col, typ in dtypes.items()
         if typ == 'uuid'
     ]
+    bytes_cols = [
+        col
+        for col, typ in dtypes.items()
+        if typ == 'bytes'
+    ]
     datetime_cols = [
         col
         for col, typ in dtypes.items()
@@ -826,6 +971,17 @@ def enforce_dtypes(
                 if debug:
                     dprint(f"Unable to parse column '{col}' as UUID:\n{e}")

+    if bytes_cols:
+        if debug:
+            dprint(f"Checking for bytes: {bytes_cols}")
+        for col in bytes_cols:
+            if col in df.columns:
+                try:
+                    df[col] = df[col].apply(attempt_cast_to_bytes)
+                except Exception as e:
+                    if debug:
+                        dprint(f"Unable to parse column '{col}' as bytes:\n{e}")
+
     if datetime_cols and coerce_timezone:
         if debug:
             dprint(f"Checking for datetime conversion: {datetime_cols}")
@@ -931,6 +1087,8 @@ def get_datetime_bound_from_df(
     -------
     The minimum or maximum datetime value in the dataframe, or `None`.
     """
+    from meerschaum.utils.dtypes import to_datetime, value_is_null
+
     if df is None:
         return None
     if not datetime_column:
@@ -982,9 +1140,9 @@ def get_datetime_bound_from_df(
             dt_val = dt_val.compute()

         return (
-            pandas.to_datetime(dt_val).to_pydatetime()
+            to_datetime(dt_val, as_pydatetime=True)
             if are_dtypes_equal(str(type(dt_val)), 'datetime')
-            else (dt_val if dt_val is not pandas.NA else None)
+            else (dt_val if not value_is_null(dt_val) else None)
         )

     return None
@@ -1127,7 +1285,7 @@ def get_first_valid_dask_partition(ddf: 'dask.dataframe.DataFrame') -> Union['pd
     for partition in ddf.partitions:
         try:
             pdf = partition.compute()
-        except Exception as e:
+        except Exception:
             continue
         if len(pdf) > 0:
             return pdf
@@ -1408,12 +1566,16 @@ def to_json(
     A JSON string.
     """
     from meerschaum.utils.packages import import_pandas
+    from meerschaum.utils.dtypes import serialize_bytes
     pd = import_pandas()
     uuid_cols = get_uuid_cols(df)
-    if uuid_cols and safe_copy:
+    bytes_cols = get_bytes_cols(df)
+    if safe_copy and bool(uuid_cols or bytes_cols):
         df = df.copy()
     for col in uuid_cols:
         df[col] = df[col].astype(str)
+    for col in bytes_cols:
+        df[col] = df[col].apply(serialize_bytes)
     return df.infer_objects(copy=False).fillna(pd.NA).to_json(
         date_format=date_format,
         date_unit=date_unit,
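
Raw `bytes` are not JSON-serializable, so `to_json()` now base64-encodes detected bytes columns (copying the frame first when `safe_copy` is set, just as it already did for UUIDs). Roughly, under assumed defaults:

```python
import pandas as pd
from meerschaum.utils.dataframe import to_json

df = pd.DataFrame({'payload': [b'\x00\x01\x02']})
print(to_json(df))  # expected to contain "AAEC"; exact shape depends on the default orient
```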
meerschaum/utils/dtypes/__init__.py

@@ -15,7 +15,19 @@ import meerschaum as mrsm
 from meerschaum.utils.typing import Dict, Union, Any
 from meerschaum.utils.warnings import warn

-MRSM_PD_DTYPES: Dict[str, str] = {
+MRSM_ALIAS_DTYPES: Dict[str, str] = {
+    'decimal': 'numeric',
+    'number': 'numeric',
+    'jsonl': 'json',
+    'JSON': 'json',
+    'binary': 'bytes',
+    'blob': 'bytes',
+    'varbinary': 'bytes',
+    'bytea': 'bytes',
+    'guid': 'uuid',
+    'UUID': 'uuid',
+}
+MRSM_PD_DTYPES: Dict[Union[str, None], str] = {
     'json': 'object',
     'numeric': 'object',
     'uuid': 'object',
@@ -27,6 +39,8 @@ MRSM_PD_DTYPES: Dict[str, str] = {
     'int32': 'Int32',
     'int64': 'Int64',
     'str': 'string[python]',
+    'bytes': 'object',
+    None: 'object',
 }

@@ -38,6 +52,10 @@ def to_pandas_dtype(dtype: str) -> str:
     if known_dtype is not None:
         return known_dtype

+    alias_dtype = MRSM_ALIAS_DTYPES.get(dtype, None)
+    if alias_dtype is not None:
+        return MRSM_PD_DTYPES[alias_dtype]
+
     ### NOTE: Kind of a hack, but if the first word of the given dtype is in all caps,
     ### treat it as a SQL db type.
     if dtype.split(' ')[0].isupper():
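
The alias table lets looser dtype spellings resolve to canonical Meerschaum types before the pandas mapping applies. For instance, following the two tables above:

```python
from meerschaum.utils.dtypes import to_pandas_dtype

print(to_pandas_dtype('bytes'))    # 'object' (direct MRSM_PD_DTYPES hit)
print(to_pandas_dtype('blob'))     # 'object' (alias: 'blob' -> 'bytes')
print(to_pandas_dtype('guid'))     # 'object' (alias: 'guid' -> 'uuid')
print(to_pandas_dtype('decimal'))  # 'object' (alias: 'decimal' -> 'numeric')
```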
@@ -95,7 +113,7 @@ def are_dtypes_equal(
     try:
         if ldtype == rdtype:
             return True
-    except Exception as e:
+    except Exception:
         warn(f"Exception when comparing dtypes, returning False:\n{traceback.format_exc()}")
         return False

@@ -115,6 +133,10 @@ def are_dtypes_equal(
     if ldtype in uuid_dtypes and rdtype in uuid_dtypes:
         return True

+    bytes_dtypes = ('bytes', 'object')
+    if ldtype in bytes_dtypes and rdtype in bytes_dtypes:
+        return True
+
     ldtype_clean = ldtype.split('[', maxsplit=1)[0]
     rdtype_clean = rdtype.split('[', maxsplit=1)[0]

@@ -185,7 +207,7 @@ def attempt_cast_to_numeric(value: Any) -> Any:
             if not value_is_null(value)
             else Decimal('NaN')
         )
-    except Exception as e:
+    except Exception:
         return value

@@ -201,7 +223,23 @@ def attempt_cast_to_uuid(value: Any) -> Any:
             if not value_is_null(value)
             else None
         )
-    except Exception as e:
+    except Exception:
+        return value
+
+
+def attempt_cast_to_bytes(value: Any) -> Any:
+    """
+    Given a value, attempt to coerce it into a bytestring.
+    """
+    if isinstance(value, bytes):
+        return value
+    try:
+        return (
+            deserialize_bytes_string(str(value))
+            if not value_is_null(value)
+            else None
+        )
+    except Exception:
         return value

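
`attempt_cast_to_bytes` is deliberately forgiving: real `bytes` pass through, nulls become `None`, decodable strings are deserialized, and anything else is returned unchanged rather than raising. Expected behavior, following the definitions in this diff:

```python
from meerschaum.utils.dtypes import attempt_cast_to_bytes

attempt_cast_to_bytes(b'raw')     # b'raw' (already bytes)
attempt_cast_to_bytes('AAEC')     # b'\x00\x01\x02' (base64-decoded)
attempt_cast_to_bytes('\\x0001')  # b'\x00\x01' (hex with BYTEA-style prefix)
attempt_cast_to_bytes(None)       # None
```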
@@ -251,7 +289,7 @@ def coerce_timezone(
 ) -> Any:
     """
     Given a `datetime`, pandas `Timestamp` or `Series` of `Timestamp`,
-    return a naive datetime in terms of UTC.
+    return a UTC timestamp (strip timezone if `strip_utc` is `True`.
     """
     if dt is None:
         return None
@@ -266,9 +304,7 @@ def coerce_timezone(
     dt_is_series = hasattr(dt, 'dtype') and hasattr(dt, '__module__')

     if dt_is_series:
-        is_dask = 'dask' in dt.__module__
         pandas = mrsm.attempt_import('pandas', lazy=False)
-        dd = mrsm.attempt_import('dask.dataframe') if is_dask else None

         if (
             pandas.api.types.is_datetime64_any_dtype(dt) and (
@@ -279,14 +315,13 @@ def coerce_timezone(
         ):
             return dt

-        dt_series = (
-            pandas.to_datetime(dt, utc=True, format='ISO8601')
-            if dd is None
-            else dd.to_datetime(dt, utc=True, format='ISO8601')
-        )
+        dt_series = to_datetime(dt, coerce_utc=False)
         if strip_utc:
-            if dt_series.dt.tz is not None:
-                dt_series = dt_series.dt.tz_localize(None)
+            try:
+                if dt_series.dt.tz is not None:
+                    dt_series = dt_series.dt.tz_localize(None)
+            except Exception:
+                pass

         return dt_series

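
The `try`/`except` around `tz_localize(None)` matters because the dateutil fallback in `to_datetime` can return an object-dtype series with no usable `.dt` accessor; in that case the series is returned as-is. The underlying pandas behavior, in isolation:

```python
import pandas as pd

s = pd.Series(pd.to_datetime(['2024-01-01T00:00:00+05:00'], utc=True))
print(s.dt.tz)                    # UTC
print(s.dt.tz_localize(None)[0])  # 2023-12-31 19:00:00 (UTC wall time, tz dropped)

obj = pd.Series(['not a datetime'])
# obj.dt.tz raises here -- hence the try/except in the hunk above
```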
@@ -299,3 +334,103 @@ def coerce_timezone(
     if strip_utc:
         return utc_dt.replace(tzinfo=None)
     return utc_dt
+
+
+def to_datetime(dt_val: Any, as_pydatetime: bool = False, coerce_utc: bool = True) -> Any:
+    """
+    Wrap `pd.to_datetime()` and add support for out-of-bounds values.
+    """
+    pandas, dateutil_parser = mrsm.attempt_import('pandas', 'dateutil.parser', lazy=False)
+    is_dask = 'dask' in getattr(dt_val, '__module__', '')
+    dd = mrsm.attempt_import('dask.dataframe') if is_dask else None
+    dt_is_series = hasattr(dt_val, 'dtype') and hasattr(dt_val, '__module__')
+    pd = pandas if dd is None else dd
+
+    try:
+        new_dt_val = pd.to_datetime(dt_val, utc=True, format='ISO8601')
+        if as_pydatetime:
+            return new_dt_val.to_pydatetime()
+        return new_dt_val
+    except (pd.errors.OutOfBoundsDatetime, ValueError):
+        pass
+
+    def parse(x: Any) -> Any:
+        try:
+            return dateutil_parser.parse(x)
+        except Exception:
+            return x
+
+    if dt_is_series:
+        new_series = dt_val.apply(parse)
+        if coerce_utc:
+            return coerce_timezone(new_series)
+        return new_series
+
+    new_dt_val = parse(dt_val)
+    if not coerce_utc:
+        return new_dt_val
+    return coerce_timezone(new_dt_val)
+
+
+def serialize_bytes(data: bytes) -> str:
+    """
+    Return the given bytes as a base64-encoded string.
+    """
+    import base64
+    if not isinstance(data, bytes) and value_is_null(data):
+        return data
+    return base64.b64encode(data).decode('utf-8')
+
+
+def deserialize_bytes_string(data: str | None, force_hex: bool = False) -> bytes | None:
+    """
+    Given a serialized ASCII string of bytes data, return the original bytes.
+    The input data may either be base64- or hex-encoded.
+
+    Parameters
+    ----------
+    data: str | None
+        The string to be deserialized into bytes.
+        May be base64- or hex-encoded (prefixed with `'\\x'`).
+
+    force_hex: bool = False
+        If `True`, treat the input string as hex-encoded.
+        If `data` does not begin with the prefix `'\\x'`, set `force_hex` to `True`.
+        This will still strip the leading `'\\x'` prefix if present.
+
+    Returns
+    -------
+    The original bytes used to produce the encoded string `data`.
+    """
+    if not isinstance(data, str) and value_is_null(data):
+        return data
+
+    import binascii
+    import base64
+
+    is_hex = force_hex or data.startswith('\\x')
+
+    if is_hex:
+        if data.startswith('\\x'):
+            data = data[2:]
+        return binascii.unhexlify(data)
+
+    return base64.b64decode(data)
+
+
+def deserialize_base64(data: str) -> bytes:
+    """
+    Return the original bytestring from the given base64-encoded string.
+    """
+    import base64
+    return base64.b64decode(data)
+
+
+def encode_bytes_for_bytea(data: bytes, with_prefix: bool = True) -> str | None:
+    """
+    Return the given bytes as a hex string for PostgreSQL's `BYTEA` type.
+    """
+    import binascii
+    if not isinstance(data, bytes) and value_is_null(data):
+        return data
+    return ('\\x' if with_prefix else '') + binascii.hexlify(data).decode('utf-8')
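
Taken together, the new helpers give `bytes` a symmetric wire format: base64 for JSON payloads and `\x`-prefixed hex for PostgreSQL `BYTEA` literals. A round-trip check built from the functions above:

```python
from meerschaum.utils.dtypes import (
    serialize_bytes,
    deserialize_bytes_string,
    encode_bytes_for_bytea,
)

data = b'\x00\x01\x02'
b64 = serialize_bytes(data)           # 'AAEC'
assert deserialize_bytes_string(b64) == data

hexed = encode_bytes_for_bytea(data)  # '\\x000102'
assert deserialize_bytes_string(hexed) == data
```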