meerschaum 2.6.0.dev1__py3-none-any.whl → 2.6.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. meerschaum/api/dash/pages/login.py +17 -17
  2. meerschaum/api/dash/pipes.py +13 -4
  3. meerschaum/api/routes/_pipes.py +162 -136
  4. meerschaum/config/_version.py +1 -1
  5. meerschaum/config/static/__init__.py +1 -0
  6. meerschaum/connectors/api/_APIConnector.py +1 -0
  7. meerschaum/connectors/api/_pipes.py +46 -13
  8. meerschaum/connectors/sql/_SQLConnector.py +4 -3
  9. meerschaum/connectors/sql/_fetch.py +4 -2
  10. meerschaum/connectors/sql/_pipes.py +496 -147
  11. meerschaum/connectors/sql/_sql.py +37 -16
  12. meerschaum/connectors/valkey/_ValkeyConnector.py +3 -2
  13. meerschaum/connectors/valkey/_pipes.py +13 -5
  14. meerschaum/core/Pipe/__init__.py +20 -0
  15. meerschaum/core/Pipe/_attributes.py +179 -9
  16. meerschaum/core/Pipe/_clear.py +10 -8
  17. meerschaum/core/Pipe/_copy.py +2 -0
  18. meerschaum/core/Pipe/_data.py +57 -28
  19. meerschaum/core/Pipe/_deduplicate.py +30 -28
  20. meerschaum/core/Pipe/_dtypes.py +12 -2
  21. meerschaum/core/Pipe/_fetch.py +11 -9
  22. meerschaum/core/Pipe/_sync.py +24 -7
  23. meerschaum/core/Pipe/_verify.py +51 -48
  24. meerschaum/utils/dataframe.py +16 -8
  25. meerschaum/utils/dtypes/__init__.py +9 -1
  26. meerschaum/utils/dtypes/sql.py +32 -6
  27. meerschaum/utils/misc.py +8 -8
  28. meerschaum/utils/sql.py +485 -16
  29. {meerschaum-2.6.0.dev1.dist-info → meerschaum-2.6.2.dist-info}/METADATA +1 -1
  30. {meerschaum-2.6.0.dev1.dist-info → meerschaum-2.6.2.dist-info}/RECORD +36 -36
  31. {meerschaum-2.6.0.dev1.dist-info → meerschaum-2.6.2.dist-info}/LICENSE +0 -0
  32. {meerschaum-2.6.0.dev1.dist-info → meerschaum-2.6.2.dist-info}/NOTICE +0 -0
  33. {meerschaum-2.6.0.dev1.dist-info → meerschaum-2.6.2.dist-info}/WHEEL +0 -0
  34. {meerschaum-2.6.0.dev1.dist-info → meerschaum-2.6.2.dist-info}/entry_points.txt +0 -0
  35. {meerschaum-2.6.0.dev1.dist-info → meerschaum-2.6.2.dist-info}/top_level.txt +0 -0
  36. {meerschaum-2.6.0.dev1.dist-info → meerschaum-2.6.2.dist-info}/zip-safe +0 -0
@@ -120,26 +120,9 @@ def get_data(
     if isinstance(omit_columns, str):
         omit_columns = [omit_columns]

+    begin, end = self.parse_date_bounds(begin, end)
     as_iterator = as_iterator or as_chunks
     dt_col = self.columns.get('datetime', None)
-    dt_typ = self.dtypes.get(dt_col, 'datetime64[ns, UTC]')
-    dt_is_utc = 'utc' in dt_typ.lower()
-    if isinstance(begin, str):
-        try:
-            begin = dateutil_parser.parse(begin)
-        except Exception as e:
-            warn(f"Failed to parse '{begin}' as datetime:\n{e}")
-            begin = None
-    if isinstance(end, str):
-        try:
-            end = dateutil_parser.parse(end)
-        except Exception as e:
-            warn(f"Failed to parse '{end}' as datetime:\n{e}")
-            end = None
-    if isinstance(begin, datetime):
-        begin = coerce_timezone(begin, strip_utc=(not dt_is_utc))
-    if isinstance(end, datetime):
-        end = coerce_timezone(end, strip_utc=(not dt_is_utc))

     def _sort_df(_df):
         if df_is_chunk_generator(_df):
@@ -330,16 +313,8 @@ def _get_data_as_iterator(
     Return a pipe's data as a generator.
     """
     from meerschaum.utils.misc import round_time
-    parse_begin = isinstance(begin, str)
-    parse_end = isinstance(end, str)
-    if parse_begin or parse_end:
-        from meerschaum.utils.packages import attempt_import
-        dateutil_parser = attempt_import('dateutil.parser')
-    if parse_begin:
-        begin = dateutil_parser.parse(begin)
-    if parse_end:
-        end = dateutil_parser.parse(end)
-
+    from meerschaum.utils.dtypes import coerce_timezone
+    begin, end = self.parse_date_bounds(begin, end)

     if not self.exists(debug=debug):
         return

@@ -351,11 +326,15 @@ def _get_data_as_iterator(
         if begin is not None
         else self.get_sync_time(round_down=False, newest=False, params=params, debug=debug)
     ) if dt_col else None
+    if isinstance(min_dt, datetime):
+        min_dt = coerce_timezone(min_dt)
     max_dt = (
         end
         if end is not None
         else self.get_sync_time(round_down=False, newest=True, params=params, debug=debug)
     ) if dt_col else None
+    if isinstance(max_dt, datetime):
+        max_dt = coerce_timezone(max_dt)

     ### We want to search just past the maximum value.
     if end is None:
@@ -469,6 +448,8 @@ def get_backtrack_data(
     if not self.exists(debug=debug):
         return None

+    begin = self.parse_date_bounds(begin)
+
     backtrack_interval = self.get_backtrack_interval(debug=debug)
     if backtrack_minutes is None:
         backtrack_minutes = (
@@ -569,6 +550,7 @@ def get_rowcount(
     from meerschaum.utils.venv import Venv
     from meerschaum.connectors import get_connector_plugin

+    begin, end = self.parse_date_bounds(begin, end)
     connector = self.instance_connector if not remote else self.connector
     try:
         with Venv(get_connector_plugin(connector)):
@@ -683,6 +665,8 @@ def get_chunk_bounds(
     if begin is None and end is None:
         return [(None, None)]

+    begin, end = self.parse_date_bounds(begin, end)
+
     ### Set the chunk interval under `pipe.parameters['verify']['chunk_minutes']`.
     chunk_interval = self.get_chunk_interval(chunk_interval, debug=debug)

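Note: with the bounds parsed up front, `get_chunk_bounds()` now accepts ISO strings and epoch-style ints as well as `datetime` objects. A rough sketch of the resulting behavior — the pipe keys are hypothetical, and the chunk interval is assumed to be the default of one day:

    import meerschaum as mrsm

    # Hypothetical pipe; keys and instance are illustrative.
    pipe = mrsm.Pipe('demo', 'weather', columns={'datetime': 'dt'})

    # Strings are coerced via parse_date_bounds() before chunking.
    bounds = pipe.get_chunk_bounds(begin='2024-01-01', end='2024-01-04', bounded=True)
    # ~> [(Jan 1, Jan 2), (Jan 2, Jan 3), (Jan 3, Jan 4)] as datetime pairs
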
@@ -714,3 +698,48 @@ def get_chunk_bounds(
         chunk_bounds = chunk_bounds + [(end, None)]

     return chunk_bounds
+
+
+def parse_date_bounds(self, *dt_vals: Union[datetime, int, None]) -> Union[
+    datetime,
+    int,
+    str,
+    None,
+    Tuple[Union[datetime, int, str, None]]
+]:
+    """
+    Given a date bound (begin, end), coerce a timezone if necessary.
+    """
+    from meerschaum.utils.misc import is_int
+    from meerschaum.utils.dtypes import coerce_timezone
+    from meerschaum.utils.warnings import warn
+    dateutil_parser = mrsm.attempt_import('dateutil.parser')
+
+    def _parse_date_bound(dt_val):
+        if dt_val is None:
+            return None
+
+        if isinstance(dt_val, int):
+            return dt_val
+
+        if dt_val == '':
+            return ''
+
+        if is_int(dt_val):
+            return int(dt_val)
+
+        if isinstance(dt_val, str):
+            try:
+                dt_val = dateutil_parser.parse(dt_val)
+            except Exception as e:
+                warn(f"Could not parse '{dt_val}' as datetime:\n{e}")
+                return None
+
+        dt_col = self.columns.get('datetime', None)
+        dt_typ = str(self.dtypes.get(dt_col, 'datetime64[ns, UTC]'))
+        return coerce_timezone(dt_val, strip_utc=('utc' not in dt_typ.lower()))
+
+    bounds = tuple(_parse_date_bound(dt_val) for dt_val in dt_vals)
+    if len(bounds) == 1:
+        return bounds[0]
+    return bounds
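The new `parse_date_bounds()` method centralizes the string- and int-parsing logic removed from `get_data()` and `_get_data_as_iterator()` above: strings go through `dateutil`, integer-like values pass through as ints, and datetimes are localized to match the pipe's datetime axis. A minimal usage sketch (the pipe keys and dtype here are illustrative):

    import meerschaum as mrsm

    pipe = mrsm.Pipe(
        'demo', 'weather',
        columns={'datetime': 'dt'},
        dtypes={'dt': 'datetime64[ns, UTC]'},  # tz-aware axis -> bounds stay UTC-aware
    )

    begin, end = pipe.parse_date_bounds('2024-01-01', '2024-02-01')
    # -> UTC-aware datetimes, because 'utc' appears in the datetime column's dtype

    chunk = pipe.parse_date_bounds('86400')
    # -> 86400 (integer-like strings are returned as ints, not parsed as dates)

    sync_time = pipe.parse_date_bounds(None)
    # -> None (a single argument returns a single value, not a 1-tuple)
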
@@ -65,14 +65,16 @@ def deduplicate(
     from meerschaum.connectors import get_connector_plugin
     from meerschaum.utils.pool import get_pool

+    begin, end = self.parse_date_bounds(begin, end)
+
     if self.cache_pipe is not None:
         success, msg = self.cache_pipe.deduplicate(
-            begin = begin,
-            end = end,
-            params = params,
-            bounded = bounded,
-            debug = debug,
-            _use_instance_method = _use_instance_method,
+            begin=begin,
+            end=end,
+            params=params,
+            bounded=bounded,
+            debug=debug,
+            _use_instance_method=_use_instance_method,
             **kwargs
         )
         if not success:
@@ -86,11 +88,11 @@ def deduplicate(
     if hasattr(self.instance_connector, 'deduplicate_pipe'):
         return self.instance_connector.deduplicate_pipe(
             self,
-            begin = begin,
-            end = end,
-            params = params,
-            bounded = bounded,
-            debug = debug,
+            begin=begin,
+            end=end,
+            params=params,
+            bounded=bounded,
+            debug=debug,
             **kwargs
         )

@@ -117,33 +119,33 @@ def deduplicate(
     )

     chunk_bounds = self.get_chunk_bounds(
-        bounded = bounded,
-        begin = begin,
-        end = end,
-        chunk_interval = chunk_interval,
-        debug = debug,
+        bounded=bounded,
+        begin=begin,
+        end=end,
+        chunk_interval=chunk_interval,
+        debug=debug,
     )

     indices = [col for col in self.columns.values() if col]
     if not indices:
-        return False, f"Cannot deduplicate without index columns."
+        return False, "Cannot deduplicate without index columns."
     dt_col = self.columns.get('datetime', None)

     def process_chunk_bounds(bounds) -> Tuple[
-            Tuple[
-                Union[datetime, int, None],
-                Union[datetime, int, None]
-            ],
-            SuccessTuple
-        ]:
+        Tuple[
+            Union[datetime, int, None],
+            Union[datetime, int, None]
+        ],
+        SuccessTuple
+    ]:
         ### Only selecting the index values here to keep bandwidth down.
         chunk_begin, chunk_end = bounds
         chunk_df = self.get_data(
-            select_columns = indices,
-            begin = chunk_begin,
-            end = chunk_end,
-            params = params,
-            debug = debug,
+            select_columns=indices,
+            begin=chunk_begin,
+            end=chunk_end,
+            params=params,
+            debug=debug,
         )
         if chunk_df is None:
             return bounds, (True, "")
@@ -30,6 +30,7 @@ def enforce_dtypes(
     from meerschaum.utils.warnings import warn
     from meerschaum.utils.debug import dprint
     from meerschaum.utils.dataframe import parse_df_datetimes, enforce_dtypes as _enforce_dtypes
+    from meerschaum.utils.dtypes import are_dtypes_equal
     from meerschaum.utils.packages import import_pandas
     pd = import_pandas(debug=debug)
     if df is None:
@@ -51,6 +52,7 @@ def enforce_dtypes(
                 for col, dtype in pipe_dtypes.items()
                 if 'datetime' not in str(dtype)
             ],
+            strip_timezone=(self.tzinfo is None),
             chunksize=chunksize,
             debug=debug,
         )
@@ -60,8 +62,9 @@ def enforce_dtypes(
             ignore_cols=[
                 col
                 for col, dtype in pipe_dtypes.items()
-                if 'datetime' not in str(dtype)
+                if not are_dtypes_equal(str(dtype), 'datetime')
             ],
+            strip_timezone=(self.tzinfo is None),
             chunksize=chunksize,
             debug=debug,
         )
@@ -77,7 +80,14 @@ def enforce_dtypes(
         )
         return df

-    return _enforce_dtypes(df, pipe_dtypes, safe_copy=safe_copy, debug=debug)
+    return _enforce_dtypes(
+        df,
+        pipe_dtypes,
+        safe_copy=safe_copy,
+        strip_timezone=(self.tzinfo is None),
+        coerce_timezone=True,
+        debug=debug,
+    )


 def infer_dtypes(self, persist: bool = False, debug: bool = False) -> Dict[str, Any]:
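These `enforce_dtypes()` changes thread the pipe's timezone-awareness into the dataframe utilities: when `self.tzinfo` is `None` (a naive datetime axis), incoming timestamps are converted to UTC and then stripped; otherwise they stay tz-aware. A rough illustration of the intent, with hypothetical pipes:

    import pandas as pd
    import meerschaum as mrsm

    naive_pipe = mrsm.Pipe(
        'demo', 'naive',
        columns={'datetime': 'dt'},
        dtypes={'dt': 'datetime64[ns]'},       # no tzinfo -> strip_timezone=True
    )
    aware_pipe = mrsm.Pipe(
        'demo', 'aware',
        columns={'datetime': 'dt'},
        dtypes={'dt': 'datetime64[ns, UTC]'},  # tz-aware -> strip_timezone=False
    )

    df = pd.DataFrame({'dt': ['2024-01-01 00:00:00+05:00']})
    # naive_pipe.enforce_dtypes(df)['dt'] -> 2023-12-31 19:00:00 (UTC, tzinfo dropped)
    # aware_pipe.enforce_dtypes(df)['dt'] -> 2023-12-31 19:00:00+00:00 (UTC, tzinfo kept)
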
@@ -18,14 +18,14 @@ if TYPE_CHECKING:
     pd = mrsm.attempt_import('pandas')

 def fetch(
-        self,
-        begin: Union[datetime, str, None] = '',
-        end: Optional[datetime] = None,
-        check_existing: bool = True,
-        sync_chunks: bool = False,
-        debug: bool = False,
-        **kw: Any
-    ) -> Union['pd.DataFrame', Iterator['pd.DataFrame'], None]:
+    self,
+    begin: Union[datetime, int, str, None] = '',
+    end: Union[datetime, int, None] = None,
+    check_existing: bool = True,
+    sync_chunks: bool = False,
+    debug: bool = False,
+    **kw: Any
+) -> Union['pd.DataFrame', Iterator['pd.DataFrame'], None]:
     """
     Fetch a Pipe's latest data from its connector.

@@ -76,6 +76,8 @@ def fetch(
             chunk_message = '\n' + chunk_label + '\n' + chunk_message
         return chunk_success, chunk_message

+    begin, end = self.parse_date_bounds(begin, end)
+
     with mrsm.Venv(get_connector_plugin(self.connector)):
         _args, _kwargs = filter_arguments(
             self.connector.fetch,
@@ -164,6 +166,6 @@ def _determine_begin(
         backtrack_interval = timedelta(minutes=backtrack_interval)
     try:
         return sync_time - backtrack_interval
-    except Exception as e:
+    except Exception:
         warn(f"Unable to substract backtrack interval {backtrack_interval} from {sync_time}.")
     return sync_time
@@ -141,6 +141,7 @@ def sync(
         chunksize = None
         sync_chunks = False

+    begin, end = self.parse_date_bounds(begin, end)
     kw.update({
         'begin': begin,
         'end': end,
@@ -460,7 +461,7 @@ def get_sync_time(
     apply_backtrack_interval: bool = False,
     round_down: bool = False,
     debug: bool = False
-) -> Union['datetime', None]:
+) -> Union['datetime', int, None]:
     """
     Get the most recent datetime value for a Pipe.

@@ -485,7 +486,7 @@ def get_sync_time(

     Returns
     -------
-    A `datetime` object if the pipe exists, otherwise `None`.
+    A `datetime` or int, if the pipe exists, otherwise `None`.

     """
     from meerschaum.utils.venv import Venv
@@ -510,13 +511,13 @@ def get_sync_time(
         except Exception as e:
             warn(f"Failed to apply backtrack interval:\n{e}")

-    return sync_time
+    return self.parse_date_bounds(sync_time)


 def exists(
-        self,
-        debug : bool = False
-    ) -> bool:
+    self,
+    debug: bool = False
+) -> bool:
     """
     See if a Pipe's table exists.

@@ -549,7 +550,11 @@ def exists(
         return _exists

     with Venv(get_connector_plugin(self.instance_connector)):
-        _exists = self.instance_connector.pipe_exists(pipe=self, debug=debug)
+        _exists = (
+            self.instance_connector.pipe_exists(pipe=self, debug=debug)
+            if hasattr(self.instance_connector, 'pipe_exists')
+            else False
+        )

     self.__dict__['_exists'] = _exists
     self.__dict__['_exists_timestamp'] = now
@@ -928,7 +933,11 @@ def _persist_new_numeric_columns(self, df, debug: bool = False) -> SuccessTuple:
     if not new_numeric_cols:
         return True, "Success"

+    self._attributes_sync_time = None
+    dt_col = self.columns.get('datetime', None)
     dtypes = self.parameters.get('dtypes', {})
+    if dt_col not in dtypes:
+        dtypes[dt_col] = 'datetime'
     dtypes.update({col: 'numeric' for col in numeric_cols})
     self.parameters['dtypes'] = dtypes
     if not self.temporary:
@@ -952,7 +961,11 @@ def _persist_new_uuid_columns(self, df, debug: bool = False) -> SuccessTuple:
     if not new_uuid_cols:
         return True, "Success"

+    self._attributes_sync_time = None
+    dt_col = self.columns.get('datetime', None)
     dtypes = self.parameters.get('dtypes', {})
+    if dt_col not in dtypes:
+        dtypes[dt_col] = 'datetime'
     dtypes.update({col: 'uuid' for col in uuid_cols})
     self.parameters['dtypes'] = dtypes
     if not self.temporary:
@@ -976,7 +989,11 @@ def _persist_new_json_columns(self, df, debug: bool = False) -> SuccessTuple:
     if not new_json_cols:
         return True, "Success"

+    self._attributes_sync_time = None
+    dt_col = self.columns.get('datetime', None)
     dtypes = self.parameters.get('dtypes', {})
+    if dt_col not in dtypes:
+        dtypes[dt_col] = 'datetime'
     dtypes.update({col: 'json' for col in json_cols})
     self.parameters['dtypes'] = dtypes

@@ -11,6 +11,7 @@ from meerschaum.utils.typing import SuccessTuple, Any, Optional, Union, Tuple, L
 from meerschaum.utils.warnings import warn, info
 from meerschaum.utils.debug import dprint

+
 def verify(
     self,
     begin: Union[datetime, int, None] = None,
@@ -84,6 +85,8 @@ def verify(
     if bounded and end is None:
         end = self.get_sync_time(newest=True, debug=debug)

+    begin, end = self.parse_date_bounds(begin, end)
+
     if bounded and end is not None:
         end += (
             timedelta(minutes=1)
@@ -98,45 +101,45 @@ def verify(

     if cannot_determine_bounds:
         sync_success, sync_msg = self.sync(
-            begin = begin,
-            end = end,
-            params = params,
-            workers = workers,
-            debug = debug,
+            begin=begin,
+            end=end,
+            params=params,
+            workers=workers,
+            debug=debug,
             **kwargs
         )
         if not sync_success:
             return sync_success, sync_msg
+
         if deduplicate:
             return self.deduplicate(
-                begin = begin,
-                end = end,
-                params = params,
-                workers = workers,
-                debug = debug,
+                begin=begin,
+                end=end,
+                params=params,
+                workers=workers,
+                debug=debug,
                 **kwargs
             )
         return sync_success, sync_msg

-
     chunk_interval = self.get_chunk_interval(chunk_interval, debug=debug)
     chunk_bounds = self.get_chunk_bounds(
-        begin = begin,
-        end = end,
-        chunk_interval = chunk_interval,
-        bounded = bounded,
-        debug = debug,
+        begin=begin,
+        end=end,
+        chunk_interval=chunk_interval,
+        bounded=bounded,
+        debug=debug,
     )

     ### Consider it a success if no chunks need to be verified.
     if not chunk_bounds:
         if deduplicate:
             return self.deduplicate(
-                begin = begin,
-                end = end,
-                params = params,
-                workers = workers,
-                debug = debug,
+                begin=begin,
+                end=end,
+                params=params,
+                workers=workers,
+                debug=debug,
                 **kwargs
             )
         return True, f"Could not determine chunks between '{begin}' and '{end}'; nothing to do."
@@ -175,21 +178,21 @@ def verify(
     ### }
     bounds_success_tuples = {}
     def process_chunk_bounds(
-            chunk_begin_and_end: Tuple[
-                Union[int, datetime],
-                Union[int, datetime]
-            ]
-        ):
+        chunk_begin_and_end: Tuple[
+            Union[int, datetime],
+            Union[int, datetime]
+        ]
+    ):
         if chunk_begin_and_end in bounds_success_tuples:
             return chunk_begin_and_end, bounds_success_tuples[chunk_begin_and_end]

         chunk_begin, chunk_end = chunk_begin_and_end
         return chunk_begin_and_end, self.sync(
-            begin = chunk_begin,
-            end = chunk_end,
-            params = params,
-            workers = workers,
-            debug = debug,
+            begin=chunk_begin,
+            end=chunk_end,
+            params=params,
+            workers=workers,
+            debug=debug,
             **kwargs
         )

@@ -216,11 +219,11 @@ def verify(
         msg = get_chunks_success_message(bounds_success_tuples, header=message_header)
         if deduplicate:
             deduplicate_success, deduplicate_msg = self.deduplicate(
-                begin = begin,
-                end = end,
-                params = params,
-                workers = workers,
-                debug = debug,
+                begin=begin,
+                end=end,
+                params=params,
+                workers=workers,
+                debug=debug,
                 **kwargs
             )
             return deduplicate_success, msg + '\n\n' + deduplicate_msg
@@ -239,7 +242,7 @@ def verify(
         warn(
             f"Will resync the following failed chunks:\n "
             + '\n '.join(bounds_to_print),
-            stack = False,
+            stack=False,
         )

     retry_bounds_success_tuples = dict(pool.map(process_chunk_bounds, chunk_bounds_to_resync))
@@ -256,11 +259,11 @@ def verify(
         )
         if deduplicate:
             deduplicate_success, deduplicate_msg = self.deduplicate(
-                begin = begin,
-                end = end,
-                params = params,
-                workers = workers,
-                debug = debug,
+                begin=begin,
+                end=end,
+                params=params,
+                workers=workers,
+                debug=debug,
                 **kwargs
             )
             return deduplicate_success, message + '\n\n' + deduplicate_msg
@@ -269,11 +272,11 @@ def verify(
     message = get_chunks_success_message(bounds_success_tuples, header=message_header)
     if deduplicate:
         deduplicate_success, deduplicate_msg = self.deduplicate(
-            begin = begin,
-            end = end,
-            params = params,
-            workers = workers,
-            debug = debug,
+            begin=begin,
+            end=end,
+            params=params,
+            workers=workers,
+            debug=debug,
             **kwargs
         )
         return deduplicate_success, message + '\n\n' + deduplicate_msg
@@ -417,7 +420,7 @@ def get_bound_time(self, debug: bool = False) -> Union[datetime, int, None]:
     -------
     A `datetime` or `int` corresponding to the
     `begin` bound for verification and deduplication syncs.
-        """
+    """
     bound_interval = self.get_bound_interval(debug=debug)
     if bound_interval is None:
         return None
@@ -235,9 +235,9 @@ def filter_unseen_df(
     try:
         for col, typ in dt_dtypes.items():
             if col in old_df.columns:
-                old_df[col] = coerce_timezone(pd.to_datetime(old_df[col], utc=True))
+                old_df[col] = coerce_timezone(old_df[col])
             if col in new_df.columns:
-                new_df[col] = coerce_timezone(pd.to_datetime(new_df[col], utc=True))
+                new_df[col] = coerce_timezone(new_df[col])
         cast_dt_cols = False
     except Exception as e:
         warn(f"Could not cast datetime columns:\n{e}")
@@ -365,7 +365,7 @@ def filter_unseen_df(
 def parse_df_datetimes(
     df: 'pd.DataFrame',
     ignore_cols: Optional[Iterable[str]] = None,
-    strip_timezone: bool = True,
+    strip_timezone: bool = False,
     chunksize: Optional[int] = None,
     dtype_backend: str = 'numpy_nullable',
     debug: bool = False,
@@ -381,7 +381,7 @@ def parse_df_datetimes(
     ignore_cols: Optional[Iterable[str]], default None
         If provided, do not attempt to coerce these columns as datetimes.

-    strip_timezone: bool, default True
+    strip_timezone: bool, default False
         If `True`, remove the UTC `tzinfo` property.

     chunksize: Optional[int], default None
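With the default flipped to `strip_timezone=False`, columns detected as datetimes now stay timezone-aware (UTC) unless the caller opts out. A quick sketch of the before/after:

    import pandas as pd
    from meerschaum.utils.dataframe import parse_df_datetimes

    df = pd.DataFrame({'dt': ['2024-01-01 12:00:00']})

    parse_df_datetimes(df)['dt'].dtype
    # datetime64[ns, UTC]   (new default: tz-aware)

    parse_df_datetimes(df, strip_timezone=True)['dt'].dtype
    # datetime64[ns]        (the previous behavior, now opt-in)
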
@@ -486,7 +486,7 @@ def parse_df_datetimes(
     if len(cols_to_inspect) == 0:
         if debug:
             dprint(f"All columns are ignored, skipping datetime detection...")
-        return df
+        return df.fillna(pandas.NA)

     ### apply regex to columns to determine which are ISO datetimes
     iso_dt_regex = r'\d{4}-\d{2}-\d{2}.\d{2}\:\d{2}\:\d+'
@@ -499,7 +499,7 @@ def parse_df_datetimes(
     if not datetime_cols:
         if debug:
             dprint("No columns detected as datetimes, returning...")
-        return df
+        return df.fillna(pandas.NA)

     if debug:
         dprint("Converting columns to datetimes: " + str(datetime_cols))
@@ -537,7 +537,7 @@ def parse_df_datetimes(
             + f"{traceback.format_exc()}"
         )

-    return df
+    return df.fillna(pandas.NA)


 def get_unhashable_cols(df: 'pd.DataFrame') -> List[str]:
@@ -689,6 +689,7 @@ def enforce_dtypes(
     safe_copy: bool = True,
     coerce_numeric: bool = True,
     coerce_timezone: bool = True,
+    strip_timezone: bool = False,
     debug: bool = False,
 ) -> 'pd.DataFrame':
     """
@@ -713,6 +714,10 @@ def enforce_dtypes(
     coerce_timezone: bool, default True
         If `True`, convert datetimes to UTC.

+    strip_timezone: bool, default False
+        If `coerce_timezone` and `strip_timezone` are `True`,
+        remove timezone information from datetimes.
+
     debug: bool, default False
         Verbosity toggle.

@@ -731,6 +736,8 @@ def enforce_dtypes(
         attempt_cast_to_uuid,
         coerce_timezone as _coerce_timezone,
     )
+    pandas = mrsm.attempt_import('pandas')
+    is_dask = 'dask' in df.__module__
     if safe_copy:
         df = df.copy()
     if len(df.columns) == 0:
@@ -814,7 +821,8 @@ def enforce_dtypes(
     if debug:
         dprint(f"Checking for datetime conversion: {datetime_cols}")
     for col in datetime_cols:
-        df[col] = _coerce_timezone(df[col])
+        if col in df.columns:
+            df[col] = _coerce_timezone(df[col], strip_utc=strip_timezone)

     df_dtypes = {c: str(t) for c, t in df.dtypes.items()}
     if are_dtypes_equal(df_dtypes, pipe_pandas_dtypes):
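The per-column work is delegated to `coerce_timezone()` from `meerschaum.utils.dtypes`, where `strip_utc` decides whether the UTC `tzinfo` survives the conversion. A minimal sketch:

    import pandas as pd
    from meerschaum.utils.dtypes import coerce_timezone

    series = pd.Series(pd.to_datetime(['2024-01-01 00:00:00+05:00']))

    coerce_timezone(series, strip_utc=False)
    # 2023-12-31 19:00:00+00:00  (converted to UTC, tz-aware)

    coerce_timezone(series, strip_utc=True)
    # 2023-12-31 19:00:00        (converted to UTC, tzinfo removed)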