meerschaum 2.6.0.dev1__py3-none-any.whl → 2.6.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- meerschaum/api/dash/pages/login.py +17 -17
- meerschaum/api/dash/pipes.py +13 -4
- meerschaum/api/routes/_pipes.py +162 -136
- meerschaum/config/_version.py +1 -1
- meerschaum/config/static/__init__.py +1 -0
- meerschaum/connectors/api/_APIConnector.py +1 -0
- meerschaum/connectors/api/_pipes.py +46 -13
- meerschaum/connectors/sql/_SQLConnector.py +4 -3
- meerschaum/connectors/sql/_fetch.py +4 -2
- meerschaum/connectors/sql/_pipes.py +496 -148
- meerschaum/connectors/sql/_sql.py +37 -16
- meerschaum/connectors/valkey/_ValkeyConnector.py +3 -2
- meerschaum/connectors/valkey/_pipes.py +13 -5
- meerschaum/core/Pipe/__init__.py +20 -0
- meerschaum/core/Pipe/_attributes.py +179 -9
- meerschaum/core/Pipe/_clear.py +10 -8
- meerschaum/core/Pipe/_copy.py +2 -0
- meerschaum/core/Pipe/_data.py +57 -28
- meerschaum/core/Pipe/_deduplicate.py +30 -28
- meerschaum/core/Pipe/_dtypes.py +12 -2
- meerschaum/core/Pipe/_fetch.py +11 -9
- meerschaum/core/Pipe/_sync.py +24 -7
- meerschaum/core/Pipe/_verify.py +51 -48
- meerschaum/utils/dataframe.py +16 -8
- meerschaum/utils/dtypes/__init__.py +9 -1
- meerschaum/utils/dtypes/sql.py +32 -6
- meerschaum/utils/misc.py +8 -8
- meerschaum/utils/sql.py +485 -16
- {meerschaum-2.6.0.dev1.dist-info → meerschaum-2.6.1.dist-info}/METADATA +1 -1
- {meerschaum-2.6.0.dev1.dist-info → meerschaum-2.6.1.dist-info}/RECORD +36 -36
- {meerschaum-2.6.0.dev1.dist-info → meerschaum-2.6.1.dist-info}/LICENSE +0 -0
- {meerschaum-2.6.0.dev1.dist-info → meerschaum-2.6.1.dist-info}/NOTICE +0 -0
- {meerschaum-2.6.0.dev1.dist-info → meerschaum-2.6.1.dist-info}/WHEEL +0 -0
- {meerschaum-2.6.0.dev1.dist-info → meerschaum-2.6.1.dist-info}/entry_points.txt +0 -0
- {meerschaum-2.6.0.dev1.dist-info → meerschaum-2.6.1.dist-info}/top_level.txt +0 -0
- {meerschaum-2.6.0.dev1.dist-info → meerschaum-2.6.1.dist-info}/zip-safe +0 -0
meerschaum/core/Pipe/_data.py
CHANGED
@@ -120,26 +120,9 @@ def get_data(
     if isinstance(omit_columns, str):
         omit_columns = [omit_columns]
 
+    begin, end = self.parse_date_bounds(begin, end)
     as_iterator = as_iterator or as_chunks
     dt_col = self.columns.get('datetime', None)
-    dt_typ = self.dtypes.get(dt_col, 'datetime64[ns, UTC]')
-    dt_is_utc = 'utc' in dt_typ.lower()
-    if isinstance(begin, str):
-        try:
-            begin = dateutil_parser.parse(begin)
-        except Exception as e:
-            warn(f"Failed to parse '{begin}' as datetime:\n{e}")
-            begin = None
-    if isinstance(end, str):
-        try:
-            end = dateutil_parser.parse(end)
-        except Exception as e:
-            warn(f"Failed to parse '{end}' as datetime:\n{e}")
-            end = None
-    if isinstance(begin, datetime):
-        begin = coerce_timezone(begin, strip_utc=(not dt_is_utc))
-    if isinstance(end, datetime):
-        end = coerce_timezone(end, strip_utc=(not dt_is_utc))
 
     def _sort_df(_df):
         if df_is_chunk_generator(_df):
@@ -330,16 +313,8 @@ def _get_data_as_iterator(
     Return a pipe's data as a generator.
     """
     from meerschaum.utils.misc import round_time
-
-
-    if parse_begin or parse_end:
-        from meerschaum.utils.packages import attempt_import
-        dateutil_parser = attempt_import('dateutil.parser')
-        if parse_begin:
-            begin = dateutil_parser.parse(begin)
-        if parse_end:
-            end = dateutil_parser.parse(end)
-
+    from meerschaum.utils.dtypes import coerce_timezone
+    begin, end = self.parse_date_bounds(begin, end)
     if not self.exists(debug=debug):
         return
 
@@ -351,11 +326,15 @@ def _get_data_as_iterator(
         if begin is not None
         else self.get_sync_time(round_down=False, newest=False, params=params, debug=debug)
     ) if dt_col else None
+    if isinstance(min_dt, datetime):
+        min_dt = coerce_timezone(min_dt)
     max_dt = (
         end
         if end is not None
        else self.get_sync_time(round_down=False, newest=True, params=params, debug=debug)
     ) if dt_col else None
+    if isinstance(max_dt, datetime):
+        max_dt = coerce_timezone(max_dt)
 
     ### We want to search just past the maximum value.
     if end is None:
@@ -469,6 +448,8 @@ def get_backtrack_data(
     if not self.exists(debug=debug):
         return None
 
+    begin = self.parse_date_bounds(begin)
+
     backtrack_interval = self.get_backtrack_interval(debug=debug)
     if backtrack_minutes is None:
         backtrack_minutes = (
@@ -569,6 +550,7 @@ def get_rowcount(
     from meerschaum.utils.venv import Venv
     from meerschaum.connectors import get_connector_plugin
 
+    begin, end = self.parse_date_bounds(begin, end)
     connector = self.instance_connector if not remote else self.connector
     try:
         with Venv(get_connector_plugin(connector)):
@@ -683,6 +665,8 @@ def get_chunk_bounds(
     if begin is None and end is None:
         return [(None, None)]
 
+    begin, end = self.parse_date_bounds(begin, end)
+
     ### Set the chunk interval under `pipe.parameters['verify']['chunk_minutes']`.
     chunk_interval = self.get_chunk_interval(chunk_interval, debug=debug)
 
@@ -714,3 +698,48 @@ def get_chunk_bounds(
         chunk_bounds = chunk_bounds + [(end, None)]
 
     return chunk_bounds
+
+
+def parse_date_bounds(self, *dt_vals: Union[datetime, int, None]) -> Union[
+    datetime,
+    int,
+    str,
+    None,
+    Tuple[Union[datetime, int, str, None]]
+]:
+    """
+    Given a date bound (begin, end), coerce a timezone if necessary.
+    """
+    from meerschaum.utils.misc import is_int
+    from meerschaum.utils.dtypes import coerce_timezone
+    from meerschaum.utils.warnings import warn
+    dateutil_parser = mrsm.attempt_import('dateutil.parser')
+
+    def _parse_date_bound(dt_val):
+        if dt_val is None:
+            return None
+
+        if isinstance(dt_val, int):
+            return dt_val
+
+        if dt_val == '':
+            return ''
+
+        if is_int(dt_val):
+            return int(dt_val)
+
+        if isinstance(dt_val, str):
+            try:
+                dt_val = dateutil_parser.parse(dt_val)
+            except Exception as e:
+                warn(f"Could not parse '{dt_val}' as datetime:\n{e}")
+                return None
+
+        dt_col = self.columns.get('datetime', None)
+        dt_typ = str(self.dtypes.get(dt_col, 'datetime64[ns, UTC]'))
+        return coerce_timezone(dt_val, strip_utc=('utc' not in dt_typ.lower()))
+
+    bounds = tuple(_parse_date_bound(dt_val) for dt_val in dt_vals)
+    if len(bounds) == 1:
+        return bounds[0]
+    return bounds
meerschaum/core/Pipe/_deduplicate.py
CHANGED
@@ -65,14 +65,16 @@ def deduplicate(
     from meerschaum.connectors import get_connector_plugin
     from meerschaum.utils.pool import get_pool
 
+    begin, end = self.parse_date_bounds(begin, end)
+
     if self.cache_pipe is not None:
         success, msg = self.cache_pipe.deduplicate(
-            begin
-            end
-            params
-            bounded
-            debug
-            _use_instance_method
+            begin=begin,
+            end=end,
+            params=params,
+            bounded=bounded,
+            debug=debug,
+            _use_instance_method=_use_instance_method,
             **kwargs
         )
         if not success:
@@ -86,11 +88,11 @@ def deduplicate(
     if hasattr(self.instance_connector, 'deduplicate_pipe'):
         return self.instance_connector.deduplicate_pipe(
             self,
-            begin
-            end
-            params
-            bounded
-            debug
+            begin=begin,
+            end=end,
+            params=params,
+            bounded=bounded,
+            debug=debug,
             **kwargs
         )
 
@@ -117,33 +119,33 @@ def deduplicate(
     )
 
     chunk_bounds = self.get_chunk_bounds(
-        bounded
-        begin
-        end
-        chunk_interval
-        debug
+        bounded=bounded,
+        begin=begin,
+        end=end,
+        chunk_interval=chunk_interval,
+        debug=debug,
     )
 
     indices = [col for col in self.columns.values() if col]
     if not indices:
-        return False,
+        return False, "Cannot deduplicate without index columns."
     dt_col = self.columns.get('datetime', None)
 
     def process_chunk_bounds(bounds) -> Tuple[
-
-
-
-
-
-
+        Tuple[
+            Union[datetime, int, None],
+            Union[datetime, int, None]
+        ],
+        SuccessTuple
+    ]:
         ### Only selecting the index values here to keep bandwidth down.
         chunk_begin, chunk_end = bounds
         chunk_df = self.get_data(
-            select_columns
-            begin
-            end
-            params
-            debug
+            select_columns=indices,
+            begin=chunk_begin,
+            end=chunk_end,
+            params=params,
+            debug=debug,
         )
         if chunk_df is None:
             return bounds, (True, "")
meerschaum/core/Pipe/_dtypes.py
CHANGED
@@ -30,6 +30,7 @@ def enforce_dtypes(
     from meerschaum.utils.warnings import warn
     from meerschaum.utils.debug import dprint
     from meerschaum.utils.dataframe import parse_df_datetimes, enforce_dtypes as _enforce_dtypes
+    from meerschaum.utils.dtypes import are_dtypes_equal
     from meerschaum.utils.packages import import_pandas
     pd = import_pandas(debug=debug)
     if df is None:
@@ -51,6 +52,7 @@ def enforce_dtypes(
                 for col, dtype in pipe_dtypes.items()
                 if 'datetime' not in str(dtype)
             ],
+            strip_timezone=(self.tzinfo is None),
             chunksize=chunksize,
             debug=debug,
         )
@@ -60,8 +62,9 @@ def enforce_dtypes(
             ignore_cols=[
                 col
                 for col, dtype in pipe_dtypes.items()
-                if
+                if not are_dtypes_equal(str(dtype), 'datetime')
             ],
+            strip_timezone=(self.tzinfo is None),
             chunksize=chunksize,
             debug=debug,
         )
@@ -77,7 +80,14 @@ def enforce_dtypes(
         )
         return df
 
-    return _enforce_dtypes(
+    return _enforce_dtypes(
+        df,
+        pipe_dtypes,
+        safe_copy=safe_copy,
+        strip_timezone=(self.tzinfo is None),
+        coerce_timezone=True,
+        debug=debug,
+    )
 
 
 def infer_dtypes(self, persist: bool = False, debug: bool = False) -> Dict[str, Any]:
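
The `strip_timezone=(self.tzinfo is None)` argument threaded through these calls means: when the pipe's datetime axis is timezone-naive, incoming timestamps should end up as naive UTC. A rough pandas sketch of that coercion (assumed semantics, not the library's exact code):

    import pandas as pd

    series = pd.Series(pd.to_datetime(['2024-01-01 07:00:00-05:00']))

    # Convert to UTC, then drop tzinfo to match a timezone-naive axis.
    naive_utc = series.dt.tz_convert('UTC').dt.tz_localize(None)
    # naive_utc[0] -> Timestamp('2024-01-01 12:00:00')
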
meerschaum/core/Pipe/_fetch.py
CHANGED
@@ -18,14 +18,14 @@ if TYPE_CHECKING:
 pd = mrsm.attempt_import('pandas')
 
 def fetch(
-
-
-
-
-
-
-
-
+    self,
+    begin: Union[datetime, int, str, None] = '',
+    end: Union[datetime, int, None] = None,
+    check_existing: bool = True,
+    sync_chunks: bool = False,
+    debug: bool = False,
+    **kw: Any
+) -> Union['pd.DataFrame', Iterator['pd.DataFrame'], None]:
     """
     Fetch a Pipe's latest data from its connector.
 
@@ -76,6 +76,8 @@ def fetch(
         chunk_message = '\n' + chunk_label + '\n' + chunk_message
         return chunk_success, chunk_message
 
+    begin, end = self.parse_date_bounds(begin, end)
+
     with mrsm.Venv(get_connector_plugin(self.connector)):
         _args, _kwargs = filter_arguments(
             self.connector.fetch,
@@ -164,6 +166,6 @@ def _determine_begin(
         backtrack_interval = timedelta(minutes=backtrack_interval)
     try:
         return sync_time - backtrack_interval
-    except Exception
+    except Exception:
         warn(f"Unable to substract backtrack interval {backtrack_interval} from {sync_time}.")
         return sync_time
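
`_determine_begin()` backtracks from the last sync time so that overlapping rows are re-fetched; the `try` block above guards the subtraction shown here (values are illustrative):

    from datetime import datetime, timedelta

    sync_time = datetime(2024, 5, 1, 12, 0)
    backtrack_interval = timedelta(minutes=1440)

    begin = sync_time - backtrack_interval
    # begin -> datetime(2024, 4, 30, 12, 0): fetching resumes one day before the last sync.
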
meerschaum/core/Pipe/_sync.py
CHANGED
@@ -141,6 +141,7 @@ def sync(
         chunksize = None
         sync_chunks = False
 
+    begin, end = self.parse_date_bounds(begin, end)
     kw.update({
         'begin': begin,
         'end': end,
@@ -460,7 +461,7 @@ def get_sync_time(
     apply_backtrack_interval: bool = False,
     round_down: bool = False,
     debug: bool = False
-) -> Union['datetime', None]:
+) -> Union['datetime', int, None]:
     """
     Get the most recent datetime value for a Pipe.
 
@@ -485,7 +486,7 @@ def get_sync_time(
 
     Returns
     -------
-    A `datetime`
+    A `datetime` or int, if the pipe exists, otherwise `None`.
 
     """
     from meerschaum.utils.venv import Venv
@@ -510,13 +511,13 @@ def get_sync_time(
         except Exception as e:
             warn(f"Failed to apply backtrack interval:\n{e}")
 
-    return sync_time
+    return self.parse_date_bounds(sync_time)
 
 
 def exists(
-
-
-
+    self,
+    debug: bool = False
+) -> bool:
     """
     See if a Pipe's table exists.
 
@@ -549,7 +550,11 @@ def exists(
         return _exists
 
     with Venv(get_connector_plugin(self.instance_connector)):
-        _exists =
+        _exists = (
+            self.instance_connector.pipe_exists(pipe=self, debug=debug)
+            if hasattr(self.instance_connector, 'pipe_exists')
+            else False
+        )
 
     self.__dict__['_exists'] = _exists
     self.__dict__['_exists_timestamp'] = now
@@ -928,7 +933,11 @@ def _persist_new_numeric_columns(self, df, debug: bool = False) -> SuccessTuple:
     if not new_numeric_cols:
         return True, "Success"
 
+    self._attributes_sync_time = None
+    dt_col = self.columns.get('datetime', None)
     dtypes = self.parameters.get('dtypes', {})
+    if dt_col not in dtypes:
+        dtypes[dt_col] = 'datetime'
     dtypes.update({col: 'numeric' for col in numeric_cols})
     self.parameters['dtypes'] = dtypes
     if not self.temporary:
@@ -952,7 +961,11 @@ def _persist_new_uuid_columns(self, df, debug: bool = False) -> SuccessTuple:
     if not new_uuid_cols:
         return True, "Success"
 
+    self._attributes_sync_time = None
+    dt_col = self.columns.get('datetime', None)
     dtypes = self.parameters.get('dtypes', {})
+    if dt_col not in dtypes:
+        dtypes[dt_col] = 'datetime'
     dtypes.update({col: 'uuid' for col in uuid_cols})
     self.parameters['dtypes'] = dtypes
     if not self.temporary:
@@ -976,7 +989,11 @@ def _persist_new_json_columns(self, df, debug: bool = False) -> SuccessTuple:
     if not new_json_cols:
         return True, "Success"
 
+    self._attributes_sync_time = None
+    dt_col = self.columns.get('datetime', None)
     dtypes = self.parameters.get('dtypes', {})
+    if dt_col not in dtypes:
+        dtypes[dt_col] = 'datetime'
     dtypes.update({col: 'json' for col in json_cols})
     self.parameters['dtypes'] = dtypes
 
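
Each `_persist_new_*_columns()` helper now seeds the datetime column into `parameters['dtypes']` before recording the newly detected columns. The resulting dictionary shape, with column names assumed for illustration:

    dt_col = 'timestamp'
    numeric_cols = ['price', 'quantity']

    dtypes = {}
    if dt_col not in dtypes:
        dtypes[dt_col] = 'datetime'
    dtypes.update({col: 'numeric' for col in numeric_cols})
    # dtypes -> {'timestamp': 'datetime', 'price': 'numeric', 'quantity': 'numeric'}
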
meerschaum/core/Pipe/_verify.py
CHANGED
@@ -11,6 +11,7 @@ from meerschaum.utils.typing import SuccessTuple, Any, Optional, Union, Tuple, L
 from meerschaum.utils.warnings import warn, info
 from meerschaum.utils.debug import dprint
 
+
 def verify(
     self,
     begin: Union[datetime, int, None] = None,
@@ -84,6 +85,8 @@ def verify(
     if bounded and end is None:
         end = self.get_sync_time(newest=True, debug=debug)
 
+    begin, end = self.parse_date_bounds(begin, end)
+
     if bounded and end is not None:
         end += (
             timedelta(minutes=1)
@@ -98,45 +101,45 @@ def verify(
 
     if cannot_determine_bounds:
         sync_success, sync_msg = self.sync(
-            begin
-            end
-            params
-            workers
-            debug
+            begin=begin,
+            end=end,
+            params=params,
+            workers=workers,
+            debug=debug,
             **kwargs
         )
         if not sync_success:
             return sync_success, sync_msg
+
         if deduplicate:
             return self.deduplicate(
-                begin
-                end
-                params
-                workers
-                debug
+                begin=begin,
+                end=end,
+                params=params,
+                workers=workers,
+                debug=debug,
                 **kwargs
             )
         return sync_success, sync_msg
 
-
     chunk_interval = self.get_chunk_interval(chunk_interval, debug=debug)
     chunk_bounds = self.get_chunk_bounds(
-        begin
-        end
-        chunk_interval
-        bounded
-        debug
+        begin=begin,
+        end=end,
+        chunk_interval=chunk_interval,
+        bounded=bounded,
+        debug=debug,
     )
 
     ### Consider it a success if no chunks need to be verified.
     if not chunk_bounds:
         if deduplicate:
             return self.deduplicate(
-                begin
-                end
-                params
-                workers
-                debug
+                begin=begin,
+                end=end,
+                params=params,
+                workers=workers,
+                debug=debug,
                 **kwargs
             )
         return True, f"Could not determine chunks between '{begin}' and '{end}'; nothing to do."
@@ -175,21 +178,21 @@ def verify(
     ### }
     bounds_success_tuples = {}
     def process_chunk_bounds(
-
-
-
-
-
+        chunk_begin_and_end: Tuple[
+            Union[int, datetime],
+            Union[int, datetime]
+        ]
+    ):
         if chunk_begin_and_end in bounds_success_tuples:
             return chunk_begin_and_end, bounds_success_tuples[chunk_begin_and_end]
 
         chunk_begin, chunk_end = chunk_begin_and_end
         return chunk_begin_and_end, self.sync(
-            begin
-            end
-            params
-            workers
-            debug
+            begin=chunk_begin,
+            end=chunk_end,
+            params=params,
+            workers=workers,
+            debug=debug,
             **kwargs
         )
 
@@ -216,11 +219,11 @@ def verify(
     msg = get_chunks_success_message(bounds_success_tuples, header=message_header)
     if deduplicate:
         deduplicate_success, deduplicate_msg = self.deduplicate(
-            begin
-            end
-            params
-            workers
-            debug
+            begin=begin,
+            end=end,
+            params=params,
+            workers=workers,
+            debug=debug,
             **kwargs
         )
         return deduplicate_success, msg + '\n\n' + deduplicate_msg
@@ -239,7 +242,7 @@ def verify(
         warn(
             f"Will resync the following failed chunks:\n "
             + '\n '.join(bounds_to_print),
-            stack
+            stack=False,
         )
 
     retry_bounds_success_tuples = dict(pool.map(process_chunk_bounds, chunk_bounds_to_resync))
@@ -256,11 +259,11 @@ def verify(
     )
     if deduplicate:
         deduplicate_success, deduplicate_msg = self.deduplicate(
-            begin
-            end
-            params
-            workers
-            debug
+            begin=begin,
+            end=end,
+            params=params,
+            workers=workers,
+            debug=debug,
             **kwargs
         )
         return deduplicate_success, message + '\n\n' + deduplicate_msg
@@ -269,11 +272,11 @@ def verify(
     message = get_chunks_success_message(bounds_success_tuples, header=message_header)
     if deduplicate:
         deduplicate_success, deduplicate_msg = self.deduplicate(
-            begin
-            end
-            params
-            workers
-            debug
+            begin=begin,
+            end=end,
+            params=params,
+            workers=workers,
+            debug=debug,
             **kwargs
         )
         return deduplicate_success, message + '\n\n' + deduplicate_msg
@@ -417,7 +420,7 @@ def get_bound_time(self, debug: bool = False) -> Union[datetime, int, None]:
     -------
     A `datetime` or `int` corresponding to the
     `begin` bound for verification and deduplication syncs.
-    """
+    """
     bound_interval = self.get_bound_interval(debug=debug)
     if bound_interval is None:
         return None
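
`verify()` records a success tuple per chunk and resyncs only the chunks that failed; the selection reduces to a filter like this sketch (bounds values are illustrative):

    bounds_success_tuples = {
        (0, 100): (True, ''),
        (100, 200): (False, 'connection reset'),
        (200, 300): (True, ''),
    }

    chunk_bounds_to_resync = [
        bounds
        for bounds, (success, _msg) in bounds_success_tuples.items()
        if not success
    ]
    # chunk_bounds_to_resync -> [(100, 200)]
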
meerschaum/utils/dataframe.py
CHANGED
@@ -235,9 +235,9 @@ def filter_unseen_df(
     try:
         for col, typ in dt_dtypes.items():
             if col in old_df.columns:
-                old_df[col] = coerce_timezone(
+                old_df[col] = coerce_timezone(old_df[col])
             if col in new_df.columns:
-                new_df[col] = coerce_timezone(
+                new_df[col] = coerce_timezone(new_df[col])
         cast_dt_cols = False
     except Exception as e:
         warn(f"Could not cast datetime columns:\n{e}")
@@ -365,7 +365,7 @@ def filter_unseen_df(
 def parse_df_datetimes(
     df: 'pd.DataFrame',
     ignore_cols: Optional[Iterable[str]] = None,
-    strip_timezone: bool =
+    strip_timezone: bool = False,
     chunksize: Optional[int] = None,
     dtype_backend: str = 'numpy_nullable',
     debug: bool = False,
@@ -381,7 +381,7 @@ def parse_df_datetimes(
     ignore_cols: Optional[Iterable[str]], default None
         If provided, do not attempt to coerce these columns as datetimes.
 
-    strip_timezone: bool, default
+    strip_timezone: bool, default False
         If `True`, remove the UTC `tzinfo` property.
 
     chunksize: Optional[int], default None
@@ -486,7 +486,7 @@ def parse_df_datetimes(
     if len(cols_to_inspect) == 0:
         if debug:
             dprint(f"All columns are ignored, skipping datetime detection...")
-        return df
+        return df.fillna(pandas.NA)
 
     ### apply regex to columns to determine which are ISO datetimes
     iso_dt_regex = r'\d{4}-\d{2}-\d{2}.\d{2}\:\d{2}\:\d+'
@@ -499,7 +499,7 @@ def parse_df_datetimes(
     if not datetime_cols:
         if debug:
             dprint("No columns detected as datetimes, returning...")
-        return df
+        return df.fillna(pandas.NA)
 
     if debug:
         dprint("Converting columns to datetimes: " + str(datetime_cols))
@@ -537,7 +537,7 @@ def parse_df_datetimes(
             + f"{traceback.format_exc()}"
         )
 
-        return df
+        return df.fillna(pandas.NA)
 
 
 def get_unhashable_cols(df: 'pd.DataFrame') -> List[str]:
@@ -689,6 +689,7 @@ def enforce_dtypes(
     safe_copy: bool = True,
     coerce_numeric: bool = True,
     coerce_timezone: bool = True,
+    strip_timezone: bool = False,
     debug: bool = False,
 ) -> 'pd.DataFrame':
     """
@@ -713,6 +714,10 @@ def enforce_dtypes(
     coerce_timezone: bool, default True
         If `True`, convert datetimes to UTC.
 
+    strip_timezone: bool, default False
+        If `coerce_timezone` and `strip_timezone` are `True`,
+        remove timezone information from datetimes.
+
     debug: bool, default False
         Verbosity toggle.
 
@@ -731,6 +736,8 @@ def enforce_dtypes(
         attempt_cast_to_uuid,
         coerce_timezone as _coerce_timezone,
     )
+    pandas = mrsm.attempt_import('pandas')
+    is_dask = 'dask' in df.__module__
     if safe_copy:
         df = df.copy()
     if len(df.columns) == 0:
@@ -814,7 +821,8 @@ def enforce_dtypes(
     if debug:
         dprint(f"Checking for datetime conversion: {datetime_cols}")
     for col in datetime_cols:
-
+        if col in df.columns:
+            df[col] = _coerce_timezone(df[col], strip_utc=strip_timezone)
 
     df_dtypes = {c: str(t) for c, t in df.dtypes.items()}
     if are_dtypes_equal(df_dtypes, pipe_pandas_dtypes):