meerschaum 2.5.0__py3-none-any.whl → 2.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- meerschaum/_internal/arguments/_parser.py +6 -1
- meerschaum/_internal/entry.py +16 -5
- meerschaum/actions/edit.py +6 -6
- meerschaum/actions/sql.py +12 -11
- meerschaum/api/dash/pages/login.py +17 -17
- meerschaum/api/dash/pipes.py +104 -13
- meerschaum/api/routes/_pipes.py +58 -40
- meerschaum/api/routes/_webterm.py +1 -0
- meerschaum/config/_edit.py +46 -19
- meerschaum/config/_read_config.py +20 -9
- meerschaum/config/_version.py +1 -1
- meerschaum/config/stack/__init__.py +1 -1
- meerschaum/config/static/__init__.py +1 -0
- meerschaum/connectors/api/_APIConnector.py +1 -0
- meerschaum/connectors/api/_pipes.py +39 -8
- meerschaum/connectors/sql/_SQLConnector.py +4 -3
- meerschaum/connectors/sql/_pipes.py +511 -118
- meerschaum/connectors/sql/_sql.py +55 -15
- meerschaum/connectors/valkey/_ValkeyConnector.py +3 -2
- meerschaum/connectors/valkey/_pipes.py +11 -5
- meerschaum/core/Pipe/__init__.py +27 -9
- meerschaum/core/Pipe/_attributes.py +181 -18
- meerschaum/core/Pipe/_clear.py +10 -8
- meerschaum/core/Pipe/_copy.py +2 -0
- meerschaum/core/Pipe/_data.py +65 -17
- meerschaum/core/Pipe/_deduplicate.py +30 -28
- meerschaum/core/Pipe/_dtypes.py +4 -4
- meerschaum/core/Pipe/_fetch.py +12 -10
- meerschaum/core/Pipe/_sync.py +28 -11
- meerschaum/core/Pipe/_verify.py +52 -49
- meerschaum/utils/dataframe.py +64 -34
- meerschaum/utils/dtypes/__init__.py +25 -6
- meerschaum/utils/dtypes/sql.py +76 -33
- meerschaum/utils/misc.py +57 -24
- meerschaum/utils/packages/_packages.py +2 -1
- meerschaum/utils/schedule.py +7 -5
- meerschaum/utils/sql.py +697 -44
- {meerschaum-2.5.0.dist-info → meerschaum-2.6.0.dist-info}/METADATA +5 -3
- {meerschaum-2.5.0.dist-info → meerschaum-2.6.0.dist-info}/RECORD +45 -45
- {meerschaum-2.5.0.dist-info → meerschaum-2.6.0.dist-info}/WHEEL +1 -1
- {meerschaum-2.5.0.dist-info → meerschaum-2.6.0.dist-info}/LICENSE +0 -0
- {meerschaum-2.5.0.dist-info → meerschaum-2.6.0.dist-info}/NOTICE +0 -0
- {meerschaum-2.5.0.dist-info → meerschaum-2.6.0.dist-info}/entry_points.txt +0 -0
- {meerschaum-2.5.0.dist-info → meerschaum-2.6.0.dist-info}/top_level.txt +0 -0
- {meerschaum-2.5.0.dist-info → meerschaum-2.6.0.dist-info}/zip-safe +0 -0
meerschaum/core/Pipe/_copy.py
CHANGED
meerschaum/core/Pipe/_data.py
CHANGED
@@ -23,8 +23,8 @@ def get_data(
|
|
23
23
|
self,
|
24
24
|
select_columns: Optional[List[str]] = None,
|
25
25
|
omit_columns: Optional[List[str]] = None,
|
26
|
-
begin: Union[datetime, int, None] = None,
|
27
|
-
end: Union[datetime, int, None] = None,
|
26
|
+
begin: Union[datetime, int, str, None] = None,
|
27
|
+
end: Union[datetime, int, str, None] = None,
|
28
28
|
params: Optional[Dict[str, Any]] = None,
|
29
29
|
as_iterator: bool = False,
|
30
30
|
as_chunks: bool = False,
|
@@ -48,12 +48,12 @@ def get_data(
|
|
48
48
|
omit_columns: Optional[List[str]], default None
|
49
49
|
If provided, remove these columns from the selection.
|
50
50
|
|
51
|
-
begin: Union[datetime, int, None], default None
|
51
|
+
begin: Union[datetime, int, str, None], default None
|
52
52
|
Lower bound datetime to begin searching for data (inclusive).
|
53
53
|
Translates to a `WHERE` clause like `WHERE datetime >= begin`.
|
54
54
|
Defaults to `None`.
|
55
55
|
|
56
|
-
end: Union[datetime, int, None], default None
|
56
|
+
end: Union[datetime, int, str, None], default None
|
57
57
|
Upper bound datetime to stop searching for data (inclusive).
|
58
58
|
Translates to a `WHERE` clause like `WHERE datetime < end`.
|
59
59
|
Defaults to `None`.
|
@@ -105,11 +105,12 @@ def get_data(
|
|
105
105
|
from meerschaum.utils.venv import Venv
|
106
106
|
from meerschaum.connectors import get_connector_plugin
|
107
107
|
from meerschaum.utils.misc import iterate_chunks, items_str
|
108
|
-
from meerschaum.utils.dtypes import to_pandas_dtype
|
108
|
+
from meerschaum.utils.dtypes import to_pandas_dtype, coerce_timezone
|
109
109
|
from meerschaum.utils.dataframe import add_missing_cols_to_df, df_is_chunk_generator
|
110
110
|
from meerschaum.utils.packages import attempt_import
|
111
111
|
dd = attempt_import('dask.dataframe') if as_dask else None
|
112
112
|
dask = attempt_import('dask') if as_dask else None
|
113
|
+
dateutil_parser = attempt_import('dateutil.parser')
|
113
114
|
|
114
115
|
if select_columns == '*':
|
115
116
|
select_columns = None
|
@@ -119,12 +120,13 @@ def get_data(
|
|
119
120
|
if isinstance(omit_columns, str):
|
120
121
|
omit_columns = [omit_columns]
|
121
122
|
|
123
|
+
begin, end = self.parse_date_bounds(begin, end)
|
122
124
|
as_iterator = as_iterator or as_chunks
|
125
|
+
dt_col = self.columns.get('datetime', None)
|
123
126
|
|
124
127
|
def _sort_df(_df):
|
125
128
|
if df_is_chunk_generator(_df):
|
126
129
|
return _df
|
127
|
-
dt_col = self.columns.get('datetime', None)
|
128
130
|
indices = [] if dt_col not in _df.columns else [dt_col]
|
129
131
|
non_dt_cols = [
|
130
132
|
col
|
@@ -311,16 +313,8 @@ def _get_data_as_iterator(
|
|
311
313
|
Return a pipe's data as a generator.
|
312
314
|
"""
|
313
315
|
from meerschaum.utils.misc import round_time
|
314
|
-
|
315
|
-
|
316
|
-
if parse_begin or parse_end:
|
317
|
-
from meerschaum.utils.packages import attempt_import
|
318
|
-
dateutil_parser = attempt_import('dateutil.parser')
|
319
|
-
if parse_begin:
|
320
|
-
begin = dateutil_parser.parse(begin)
|
321
|
-
if parse_end:
|
322
|
-
end = dateutil_parser.parse(end)
|
323
|
-
|
316
|
+
from meerschaum.utils.dtypes import coerce_timezone
|
317
|
+
begin, end = self.parse_date_bounds(begin, end)
|
324
318
|
if not self.exists(debug=debug):
|
325
319
|
return
|
326
320
|
|
@@ -332,11 +326,15 @@ def _get_data_as_iterator(
|
|
332
326
|
if begin is not None
|
333
327
|
else self.get_sync_time(round_down=False, newest=False, params=params, debug=debug)
|
334
328
|
) if dt_col else None
|
329
|
+
if isinstance(min_dt, datetime):
|
330
|
+
min_dt = coerce_timezone(min_dt)
|
335
331
|
max_dt = (
|
336
332
|
end
|
337
333
|
if end is not None
|
338
334
|
else self.get_sync_time(round_down=False, newest=True, params=params, debug=debug)
|
339
335
|
) if dt_col else None
|
336
|
+
if isinstance(max_dt, datetime):
|
337
|
+
max_dt = coerce_timezone(max_dt)
|
340
338
|
|
341
339
|
### We want to search just past the maximum value.
|
342
340
|
if end is None:
|
@@ -450,6 +448,8 @@ def get_backtrack_data(
|
|
450
448
|
if not self.exists(debug=debug):
|
451
449
|
return None
|
452
450
|
|
451
|
+
begin = self.parse_date_bounds(begin)
|
452
|
+
|
453
453
|
backtrack_interval = self.get_backtrack_interval(debug=debug)
|
454
454
|
if backtrack_minutes is None:
|
455
455
|
backtrack_minutes = (
|
@@ -550,6 +550,7 @@ def get_rowcount(
|
|
550
550
|
from meerschaum.utils.venv import Venv
|
551
551
|
from meerschaum.connectors import get_connector_plugin
|
552
552
|
|
553
|
+
begin, end = self.parse_date_bounds(begin, end)
|
553
554
|
connector = self.instance_connector if not remote else self.connector
|
554
555
|
try:
|
555
556
|
with Venv(get_connector_plugin(connector)):
|
@@ -607,7 +608,7 @@ def get_chunk_interval(
|
|
607
608
|
if dt_col is None:
|
608
609
|
return timedelta(minutes=chunk_minutes)
|
609
610
|
|
610
|
-
dt_dtype = self.dtypes.get(dt_col, 'datetime64[ns]')
|
611
|
+
dt_dtype = self.dtypes.get(dt_col, 'datetime64[ns, UTC]')
|
611
612
|
if 'int' in dt_dtype.lower():
|
612
613
|
return chunk_minutes
|
613
614
|
return timedelta(minutes=chunk_minutes)
|
@@ -664,6 +665,8 @@ def get_chunk_bounds(
|
|
664
665
|
if begin is None and end is None:
|
665
666
|
return [(None, None)]
|
666
667
|
|
668
|
+
begin, end = self.parse_date_bounds(begin, end)
|
669
|
+
|
667
670
|
### Set the chunk interval under `pipe.parameters['verify']['chunk_minutes']`.
|
668
671
|
chunk_interval = self.get_chunk_interval(chunk_interval, debug=debug)
|
669
672
|
|
@@ -695,3 +698,48 @@ def get_chunk_bounds(
|
|
695
698
|
chunk_bounds = chunk_bounds + [(end, None)]
|
696
699
|
|
697
700
|
return chunk_bounds
|
701
|
+
|
702
|
+
|
703
|
+
def parse_date_bounds(self, *dt_vals: Union[datetime, int, None]) -> Union[
|
704
|
+
datetime,
|
705
|
+
int,
|
706
|
+
str,
|
707
|
+
None,
|
708
|
+
Tuple[Union[datetime, int, str, None]]
|
709
|
+
]:
|
710
|
+
"""
|
711
|
+
Given a date bound (begin, end), coerce a timezone if necessary.
|
712
|
+
"""
|
713
|
+
from meerschaum.utils.misc import is_int
|
714
|
+
from meerschaum.utils.dtypes import coerce_timezone
|
715
|
+
from meerschaum.utils.warnings import warn
|
716
|
+
dateutil_parser = mrsm.attempt_import('dateutil.parser')
|
717
|
+
|
718
|
+
def _parse_date_bound(dt_val):
|
719
|
+
if dt_val is None:
|
720
|
+
return None
|
721
|
+
|
722
|
+
if isinstance(dt_val, int):
|
723
|
+
return dt_val
|
724
|
+
|
725
|
+
if dt_val == '':
|
726
|
+
return ''
|
727
|
+
|
728
|
+
if is_int(dt_val):
|
729
|
+
return int(dt_val)
|
730
|
+
|
731
|
+
if isinstance(dt_val, str):
|
732
|
+
try:
|
733
|
+
dt_val = dateutil_parser.parse(dt_val)
|
734
|
+
except Exception as e:
|
735
|
+
warn(f"Could not parse '{dt_val}' as datetime:\n{e}")
|
736
|
+
return None
|
737
|
+
|
738
|
+
dt_col = self.columns.get('datetime', None)
|
739
|
+
dt_typ = str(self.dtypes.get(dt_col, 'datetime64[ns, UTC]'))
|
740
|
+
return coerce_timezone(dt_val, strip_utc=('utc' not in dt_typ.lower()))
|
741
|
+
|
742
|
+
bounds = tuple(_parse_date_bound(dt_val) for dt_val in dt_vals)
|
743
|
+
if len(bounds) == 1:
|
744
|
+
return bounds[0]
|
745
|
+
return bounds
|
@@ -65,14 +65,16 @@ def deduplicate(
|
|
65
65
|
from meerschaum.connectors import get_connector_plugin
|
66
66
|
from meerschaum.utils.pool import get_pool
|
67
67
|
|
68
|
+
begin, end = self.parse_date_bounds(begin, end)
|
69
|
+
|
68
70
|
if self.cache_pipe is not None:
|
69
71
|
success, msg = self.cache_pipe.deduplicate(
|
70
|
-
begin
|
71
|
-
end
|
72
|
-
params
|
73
|
-
bounded
|
74
|
-
debug
|
75
|
-
_use_instance_method
|
72
|
+
begin=begin,
|
73
|
+
end=end,
|
74
|
+
params=params,
|
75
|
+
bounded=bounded,
|
76
|
+
debug=debug,
|
77
|
+
_use_instance_method=_use_instance_method,
|
76
78
|
**kwargs
|
77
79
|
)
|
78
80
|
if not success:
|
@@ -86,11 +88,11 @@ def deduplicate(
|
|
86
88
|
if hasattr(self.instance_connector, 'deduplicate_pipe'):
|
87
89
|
return self.instance_connector.deduplicate_pipe(
|
88
90
|
self,
|
89
|
-
begin
|
90
|
-
end
|
91
|
-
params
|
92
|
-
bounded
|
93
|
-
debug
|
91
|
+
begin=begin,
|
92
|
+
end=end,
|
93
|
+
params=params,
|
94
|
+
bounded=bounded,
|
95
|
+
debug=debug,
|
94
96
|
**kwargs
|
95
97
|
)
|
96
98
|
|
@@ -117,33 +119,33 @@ def deduplicate(
|
|
117
119
|
)
|
118
120
|
|
119
121
|
chunk_bounds = self.get_chunk_bounds(
|
120
|
-
bounded
|
121
|
-
begin
|
122
|
-
end
|
123
|
-
chunk_interval
|
124
|
-
debug
|
122
|
+
bounded=bounded,
|
123
|
+
begin=begin,
|
124
|
+
end=end,
|
125
|
+
chunk_interval=chunk_interval,
|
126
|
+
debug=debug,
|
125
127
|
)
|
126
128
|
|
127
129
|
indices = [col for col in self.columns.values() if col]
|
128
130
|
if not indices:
|
129
|
-
return False,
|
131
|
+
return False, "Cannot deduplicate without index columns."
|
130
132
|
dt_col = self.columns.get('datetime', None)
|
131
133
|
|
132
134
|
def process_chunk_bounds(bounds) -> Tuple[
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
135
|
+
Tuple[
|
136
|
+
Union[datetime, int, None],
|
137
|
+
Union[datetime, int, None]
|
138
|
+
],
|
139
|
+
SuccessTuple
|
140
|
+
]:
|
139
141
|
### Only selecting the index values here to keep bandwidth down.
|
140
142
|
chunk_begin, chunk_end = bounds
|
141
143
|
chunk_df = self.get_data(
|
142
|
-
select_columns
|
143
|
-
begin
|
144
|
-
end
|
145
|
-
params
|
146
|
-
debug
|
144
|
+
select_columns=indices,
|
145
|
+
begin=chunk_begin,
|
146
|
+
end=chunk_end,
|
147
|
+
params=params,
|
148
|
+
debug=debug,
|
147
149
|
)
|
148
150
|
if chunk_df is None:
|
149
151
|
return bounds, (True, "")
|
meerschaum/core/Pipe/_dtypes.py
CHANGED
@@ -101,18 +101,18 @@ def infer_dtypes(self, persist: bool = False, debug: bool = False) -> Dict[str,
|
|
101
101
|
dt_col = self.columns.get('datetime', None)
|
102
102
|
if dt_col:
|
103
103
|
if not self.parameters.get('dtypes', {}).get(dt_col, None):
|
104
|
-
dtypes[dt_col] = 'datetime64[ns]'
|
104
|
+
dtypes[dt_col] = 'datetime64[ns, UTC]'
|
105
105
|
return dtypes
|
106
106
|
|
107
|
-
from meerschaum.utils.sql import
|
108
|
-
from meerschaum.utils.
|
107
|
+
from meerschaum.utils.dtypes.sql import get_pd_type_from_db_type
|
108
|
+
from meerschaum.utils.dtypes import to_pandas_dtype
|
109
109
|
columns_types = self.get_columns_types(debug=debug)
|
110
110
|
|
111
111
|
### NOTE: get_columns_types() may return either the types as
|
112
112
|
### PostgreSQL- or Pandas-style.
|
113
113
|
dtypes = {
|
114
114
|
c: (
|
115
|
-
|
115
|
+
get_pd_type_from_db_type(t, allow_custom_dtypes=True)
|
116
116
|
if str(t).isupper()
|
117
117
|
else to_pandas_dtype(t)
|
118
118
|
)
|
meerschaum/core/Pipe/_fetch.py
CHANGED
@@ -18,14 +18,14 @@ if TYPE_CHECKING:
|
|
18
18
|
pd = mrsm.attempt_import('pandas')
|
19
19
|
|
20
20
|
def fetch(
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
21
|
+
self,
|
22
|
+
begin: Union[datetime, int, str, None] = '',
|
23
|
+
end: Union[datetime, int, None] = None,
|
24
|
+
check_existing: bool = True,
|
25
|
+
sync_chunks: bool = False,
|
26
|
+
debug: bool = False,
|
27
|
+
**kw: Any
|
28
|
+
) -> Union['pd.DataFrame', Iterator['pd.DataFrame'], None]:
|
29
29
|
"""
|
30
30
|
Fetch a Pipe's latest data from its connector.
|
31
31
|
|
@@ -76,6 +76,8 @@ def fetch(
|
|
76
76
|
chunk_message = '\n' + chunk_label + '\n' + chunk_message
|
77
77
|
return chunk_success, chunk_message
|
78
78
|
|
79
|
+
begin, end = self.parse_date_bounds(begin, end)
|
80
|
+
|
79
81
|
with mrsm.Venv(get_connector_plugin(self.connector)):
|
80
82
|
_args, _kwargs = filter_arguments(
|
81
83
|
self.connector.fetch,
|
@@ -125,7 +127,7 @@ def get_backtrack_interval(
|
|
125
127
|
if dt_col is None:
|
126
128
|
return backtrack_interval
|
127
129
|
|
128
|
-
dt_dtype = self.dtypes.get(dt_col, 'datetime64[ns]')
|
130
|
+
dt_dtype = self.dtypes.get(dt_col, 'datetime64[ns, UTC]')
|
129
131
|
if 'int' in dt_dtype.lower():
|
130
132
|
return backtrack_minutes
|
131
133
|
|
@@ -164,6 +166,6 @@ def _determine_begin(
|
|
164
166
|
backtrack_interval = timedelta(minutes=backtrack_interval)
|
165
167
|
try:
|
166
168
|
return sync_time - backtrack_interval
|
167
|
-
except Exception
|
169
|
+
except Exception:
|
168
170
|
warn(f"Unable to substract backtrack interval {backtrack_interval} from {sync_time}.")
|
169
171
|
return sync_time
|
meerschaum/core/Pipe/_sync.py
CHANGED
@@ -141,6 +141,7 @@ def sync(
|
|
141
141
|
chunksize = None
|
142
142
|
sync_chunks = False
|
143
143
|
|
144
|
+
begin, end = self.parse_date_bounds(begin, end)
|
144
145
|
kw.update({
|
145
146
|
'begin': begin,
|
146
147
|
'end': end,
|
@@ -460,7 +461,7 @@ def get_sync_time(
|
|
460
461
|
apply_backtrack_interval: bool = False,
|
461
462
|
round_down: bool = False,
|
462
463
|
debug: bool = False
|
463
|
-
) -> Union['datetime', None]:
|
464
|
+
) -> Union['datetime', int, None]:
|
464
465
|
"""
|
465
466
|
Get the most recent datetime value for a Pipe.
|
466
467
|
|
@@ -485,7 +486,7 @@ def get_sync_time(
|
|
485
486
|
|
486
487
|
Returns
|
487
488
|
-------
|
488
|
-
A `datetime`
|
489
|
+
A `datetime` or int, if the pipe exists, otherwise `None`.
|
489
490
|
|
490
491
|
"""
|
491
492
|
from meerschaum.utils.venv import Venv
|
@@ -510,13 +511,13 @@ def get_sync_time(
|
|
510
511
|
except Exception as e:
|
511
512
|
warn(f"Failed to apply backtrack interval:\n{e}")
|
512
513
|
|
513
|
-
return sync_time
|
514
|
+
return self.parse_date_bounds(sync_time)
|
514
515
|
|
515
516
|
|
516
517
|
def exists(
|
517
|
-
|
518
|
-
|
519
|
-
|
518
|
+
self,
|
519
|
+
debug: bool = False
|
520
|
+
) -> bool:
|
520
521
|
"""
|
521
522
|
See if a Pipe's table exists.
|
522
523
|
|
@@ -549,7 +550,11 @@ def exists(
|
|
549
550
|
return _exists
|
550
551
|
|
551
552
|
with Venv(get_connector_plugin(self.instance_connector)):
|
552
|
-
_exists =
|
553
|
+
_exists = (
|
554
|
+
self.instance_connector.pipe_exists(pipe=self, debug=debug)
|
555
|
+
if hasattr(self.instance_connector, 'pipe_exists')
|
556
|
+
else False
|
557
|
+
)
|
553
558
|
|
554
559
|
self.__dict__['_exists'] = _exists
|
555
560
|
self.__dict__['_exists_timestamp'] = now
|
@@ -624,6 +629,18 @@ def filter_existing(
|
|
624
629
|
merge = pd.merge
|
625
630
|
NA = pd.NA
|
626
631
|
|
632
|
+
primary_key = self.columns.get('primary', None)
|
633
|
+
autoincrement = self.parameters.get('autoincrement', False)
|
634
|
+
pipe_columns = self.columns.copy()
|
635
|
+
|
636
|
+
if primary_key and autoincrement and df is not None and primary_key in df.columns:
|
637
|
+
if safe_copy:
|
638
|
+
df = df.copy()
|
639
|
+
safe_copy = False
|
640
|
+
if df[primary_key].isnull().all():
|
641
|
+
del df[primary_key]
|
642
|
+
_ = self.columns.pop(primary_key, None)
|
643
|
+
|
627
644
|
def get_empty_df():
|
628
645
|
empty_df = pd.DataFrame([])
|
629
646
|
dtypes = dict(df.dtypes) if df is not None else {}
|
@@ -643,8 +660,8 @@ def filter_existing(
|
|
643
660
|
|
644
661
|
### begin is the oldest data in the new dataframe
|
645
662
|
begin, end = None, None
|
646
|
-
dt_col =
|
647
|
-
dt_type = self.dtypes.get(dt_col, 'datetime64[ns]') if dt_col else None
|
663
|
+
dt_col = pipe_columns.get('datetime', None)
|
664
|
+
dt_type = self.dtypes.get(dt_col, 'datetime64[ns, UTC]') if dt_col else None
|
648
665
|
try:
|
649
666
|
min_dt_val = df[dt_col].min(skipna=True) if dt_col else None
|
650
667
|
if is_dask and min_dt_val is not None:
|
@@ -713,7 +730,7 @@ def filter_existing(
|
|
713
730
|
|
714
731
|
unique_index_vals = {
|
715
732
|
col: df[col].unique()
|
716
|
-
for col in
|
733
|
+
for col in pipe_columns
|
717
734
|
if col in df.columns and col != dt_col
|
718
735
|
} if not date_bound_only else {}
|
719
736
|
filter_params_index_limit = get_config('pipes', 'sync', 'filter_params_index_limit')
|
@@ -749,7 +766,7 @@ def filter_existing(
|
|
749
766
|
|
750
767
|
### Separate new rows from changed ones.
|
751
768
|
on_cols = [
|
752
|
-
col for col_key, col in
|
769
|
+
col for col_key, col in pipe_columns.items()
|
753
770
|
if (
|
754
771
|
col
|
755
772
|
and
|
meerschaum/core/Pipe/_verify.py
CHANGED
@@ -11,6 +11,7 @@ from meerschaum.utils.typing import SuccessTuple, Any, Optional, Union, Tuple, L
|
|
11
11
|
from meerschaum.utils.warnings import warn, info
|
12
12
|
from meerschaum.utils.debug import dprint
|
13
13
|
|
14
|
+
|
14
15
|
def verify(
|
15
16
|
self,
|
16
17
|
begin: Union[datetime, int, None] = None,
|
@@ -84,6 +85,8 @@ def verify(
|
|
84
85
|
if bounded and end is None:
|
85
86
|
end = self.get_sync_time(newest=True, debug=debug)
|
86
87
|
|
88
|
+
begin, end = self.parse_date_bounds(begin, end)
|
89
|
+
|
87
90
|
if bounded and end is not None:
|
88
91
|
end += (
|
89
92
|
timedelta(minutes=1)
|
@@ -98,45 +101,45 @@ def verify(
|
|
98
101
|
|
99
102
|
if cannot_determine_bounds:
|
100
103
|
sync_success, sync_msg = self.sync(
|
101
|
-
begin
|
102
|
-
end
|
103
|
-
params
|
104
|
-
workers
|
105
|
-
debug
|
104
|
+
begin=begin,
|
105
|
+
end=end,
|
106
|
+
params=params,
|
107
|
+
workers=workers,
|
108
|
+
debug=debug,
|
106
109
|
**kwargs
|
107
110
|
)
|
108
111
|
if not sync_success:
|
109
112
|
return sync_success, sync_msg
|
113
|
+
|
110
114
|
if deduplicate:
|
111
115
|
return self.deduplicate(
|
112
|
-
begin
|
113
|
-
end
|
114
|
-
params
|
115
|
-
workers
|
116
|
-
debug
|
116
|
+
begin=begin,
|
117
|
+
end=end,
|
118
|
+
params=params,
|
119
|
+
workers=workers,
|
120
|
+
debug=debug,
|
117
121
|
**kwargs
|
118
122
|
)
|
119
123
|
return sync_success, sync_msg
|
120
124
|
|
121
|
-
|
122
125
|
chunk_interval = self.get_chunk_interval(chunk_interval, debug=debug)
|
123
126
|
chunk_bounds = self.get_chunk_bounds(
|
124
|
-
begin
|
125
|
-
end
|
126
|
-
chunk_interval
|
127
|
-
bounded
|
128
|
-
debug
|
127
|
+
begin=begin,
|
128
|
+
end=end,
|
129
|
+
chunk_interval=chunk_interval,
|
130
|
+
bounded=bounded,
|
131
|
+
debug=debug,
|
129
132
|
)
|
130
133
|
|
131
134
|
### Consider it a success if no chunks need to be verified.
|
132
135
|
if not chunk_bounds:
|
133
136
|
if deduplicate:
|
134
137
|
return self.deduplicate(
|
135
|
-
begin
|
136
|
-
end
|
137
|
-
params
|
138
|
-
workers
|
139
|
-
debug
|
138
|
+
begin=begin,
|
139
|
+
end=end,
|
140
|
+
params=params,
|
141
|
+
workers=workers,
|
142
|
+
debug=debug,
|
140
143
|
**kwargs
|
141
144
|
)
|
142
145
|
return True, f"Could not determine chunks between '{begin}' and '{end}'; nothing to do."
|
@@ -175,21 +178,21 @@ def verify(
|
|
175
178
|
### }
|
176
179
|
bounds_success_tuples = {}
|
177
180
|
def process_chunk_bounds(
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
181
|
+
chunk_begin_and_end: Tuple[
|
182
|
+
Union[int, datetime],
|
183
|
+
Union[int, datetime]
|
184
|
+
]
|
185
|
+
):
|
183
186
|
if chunk_begin_and_end in bounds_success_tuples:
|
184
187
|
return chunk_begin_and_end, bounds_success_tuples[chunk_begin_and_end]
|
185
188
|
|
186
189
|
chunk_begin, chunk_end = chunk_begin_and_end
|
187
190
|
return chunk_begin_and_end, self.sync(
|
188
|
-
begin
|
189
|
-
end
|
190
|
-
params
|
191
|
-
workers
|
192
|
-
debug
|
191
|
+
begin=chunk_begin,
|
192
|
+
end=chunk_end,
|
193
|
+
params=params,
|
194
|
+
workers=workers,
|
195
|
+
debug=debug,
|
193
196
|
**kwargs
|
194
197
|
)
|
195
198
|
|
@@ -216,11 +219,11 @@ def verify(
|
|
216
219
|
msg = get_chunks_success_message(bounds_success_tuples, header=message_header)
|
217
220
|
if deduplicate:
|
218
221
|
deduplicate_success, deduplicate_msg = self.deduplicate(
|
219
|
-
begin
|
220
|
-
end
|
221
|
-
params
|
222
|
-
workers
|
223
|
-
debug
|
222
|
+
begin=begin,
|
223
|
+
end=end,
|
224
|
+
params=params,
|
225
|
+
workers=workers,
|
226
|
+
debug=debug,
|
224
227
|
**kwargs
|
225
228
|
)
|
226
229
|
return deduplicate_success, msg + '\n\n' + deduplicate_msg
|
@@ -239,7 +242,7 @@ def verify(
|
|
239
242
|
warn(
|
240
243
|
f"Will resync the following failed chunks:\n "
|
241
244
|
+ '\n '.join(bounds_to_print),
|
242
|
-
stack
|
245
|
+
stack=False,
|
243
246
|
)
|
244
247
|
|
245
248
|
retry_bounds_success_tuples = dict(pool.map(process_chunk_bounds, chunk_bounds_to_resync))
|
@@ -256,11 +259,11 @@ def verify(
|
|
256
259
|
)
|
257
260
|
if deduplicate:
|
258
261
|
deduplicate_success, deduplicate_msg = self.deduplicate(
|
259
|
-
begin
|
260
|
-
end
|
261
|
-
params
|
262
|
-
workers
|
263
|
-
debug
|
262
|
+
begin=begin,
|
263
|
+
end=end,
|
264
|
+
params=params,
|
265
|
+
workers=workers,
|
266
|
+
debug=debug,
|
264
267
|
**kwargs
|
265
268
|
)
|
266
269
|
return deduplicate_success, message + '\n\n' + deduplicate_msg
|
@@ -269,11 +272,11 @@ def verify(
|
|
269
272
|
message = get_chunks_success_message(bounds_success_tuples, header=message_header)
|
270
273
|
if deduplicate:
|
271
274
|
deduplicate_success, deduplicate_msg = self.deduplicate(
|
272
|
-
begin
|
273
|
-
end
|
274
|
-
params
|
275
|
-
workers
|
276
|
-
debug
|
275
|
+
begin=begin,
|
276
|
+
end=end,
|
277
|
+
params=params,
|
278
|
+
workers=workers,
|
279
|
+
debug=debug,
|
277
280
|
**kwargs
|
278
281
|
)
|
279
282
|
return deduplicate_success, message + '\n\n' + deduplicate_msg
|
@@ -394,7 +397,7 @@ def get_bound_interval(self, debug: bool = False) -> Union[timedelta, int, None]
|
|
394
397
|
if not dt_col:
|
395
398
|
return bound_time_value
|
396
399
|
|
397
|
-
dt_typ = self.dtypes.get(dt_col, 'datetime64[ns]')
|
400
|
+
dt_typ = self.dtypes.get(dt_col, 'datetime64[ns, UTC]')
|
398
401
|
if 'int' in dt_typ.lower():
|
399
402
|
return int(bound_time_value)
|
400
403
|
|
@@ -417,7 +420,7 @@ def get_bound_time(self, debug: bool = False) -> Union[datetime, int, None]:
|
|
417
420
|
-------
|
418
421
|
A `datetime` or `int` corresponding to the
|
419
422
|
`begin` bound for verification and deduplication syncs.
|
420
|
-
"""
|
423
|
+
"""
|
421
424
|
bound_interval = self.get_bound_interval(debug=debug)
|
422
425
|
if bound_interval is None:
|
423
426
|
return None
|