meerschaum 2.0.0rc7__py3-none-any.whl → 2.0.0rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- meerschaum/actions/__init__.py +97 -48
- meerschaum/actions/bootstrap.py +1 -1
- meerschaum/actions/clear.py +1 -1
- meerschaum/actions/deduplicate.py +1 -1
- meerschaum/actions/delete.py +8 -7
- meerschaum/actions/drop.py +1 -10
- meerschaum/actions/edit.py +1 -1
- meerschaum/actions/install.py +1 -1
- meerschaum/actions/pause.py +1 -1
- meerschaum/actions/register.py +1 -1
- meerschaum/actions/setup.py +1 -1
- meerschaum/actions/show.py +1 -1
- meerschaum/actions/start.py +18 -7
- meerschaum/actions/stop.py +5 -4
- meerschaum/actions/sync.py +3 -1
- meerschaum/actions/uninstall.py +1 -1
- meerschaum/actions/upgrade.py +1 -1
- meerschaum/actions/verify.py +54 -3
- meerschaum/config/_default.py +1 -1
- meerschaum/config/_formatting.py +26 -0
- meerschaum/config/_jobs.py +28 -5
- meerschaum/config/_paths.py +21 -5
- meerschaum/config/_version.py +1 -1
- meerschaum/connectors/api/_fetch.py +40 -38
- meerschaum/connectors/api/_pipes.py +10 -17
- meerschaum/connectors/sql/_fetch.py +29 -11
- meerschaum/connectors/sql/_pipes.py +1 -2
- meerschaum/core/Pipe/__init__.py +31 -10
- meerschaum/core/Pipe/_data.py +23 -13
- meerschaum/core/Pipe/_deduplicate.py +44 -23
- meerschaum/core/Pipe/_dtypes.py +2 -1
- meerschaum/core/Pipe/_fetch.py +29 -0
- meerschaum/core/Pipe/_sync.py +25 -18
- meerschaum/core/Pipe/_verify.py +60 -25
- meerschaum/plugins/__init__.py +3 -0
- meerschaum/utils/daemon/Daemon.py +108 -27
- meerschaum/utils/daemon/__init__.py +35 -1
- meerschaum/utils/dataframe.py +2 -0
- meerschaum/utils/formatting/__init__.py +144 -1
- meerschaum/utils/formatting/_pipes.py +28 -5
- meerschaum/utils/misc.py +184 -188
- meerschaum/utils/packages/__init__.py +1 -1
- meerschaum/utils/packages/_packages.py +1 -0
- {meerschaum-2.0.0rc7.dist-info → meerschaum-2.0.0rc9.dist-info}/METADATA +4 -1
- {meerschaum-2.0.0rc7.dist-info → meerschaum-2.0.0rc9.dist-info}/RECORD +51 -51
- {meerschaum-2.0.0rc7.dist-info → meerschaum-2.0.0rc9.dist-info}/LICENSE +0 -0
- {meerschaum-2.0.0rc7.dist-info → meerschaum-2.0.0rc9.dist-info}/NOTICE +0 -0
- {meerschaum-2.0.0rc7.dist-info → meerschaum-2.0.0rc9.dist-info}/WHEEL +0 -0
- {meerschaum-2.0.0rc7.dist-info → meerschaum-2.0.0rc9.dist-info}/entry_points.txt +0 -0
- {meerschaum-2.0.0rc7.dist-info → meerschaum-2.0.0rc9.dist-info}/top_level.txt +0 -0
- {meerschaum-2.0.0rc7.dist-info → meerschaum-2.0.0rc9.dist-info}/zip-safe +0 -0
@@ -7,60 +7,62 @@ Fetch Pipe data via the API connector
|
|
7
7
|
"""
|
8
8
|
|
9
9
|
from __future__ import annotations
|
10
|
-
import datetime
|
10
|
+
from datetime import datetime
|
11
11
|
import copy
|
12
|
-
|
12
|
+
import meerschaum as mrsm
|
13
|
+
from meerschaum.utils.typing import Any, Optional, Dict, Iterator, Union
|
13
14
|
|
14
15
|
def fetch(
|
15
16
|
self,
|
16
|
-
pipe:
|
17
|
-
begin:
|
18
|
-
end:
|
17
|
+
pipe: mrsm.Pipe,
|
18
|
+
begin: Union[datetime, str, int] = '',
|
19
|
+
end: Union[datetime, int] = None,
|
19
20
|
params: Optional[Dict, Any] = None,
|
20
21
|
debug: bool = False,
|
21
22
|
**kw: Any
|
22
|
-
) ->
|
23
|
+
) -> Iterator['pd.DataFrame']:
|
23
24
|
"""Get the Pipe data from the remote Pipe."""
|
24
25
|
from meerschaum.utils.debug import dprint
|
25
26
|
from meerschaum.utils.warnings import warn, error
|
26
|
-
from meerschaum.config.static import _static_config
|
27
27
|
from meerschaum.config._patch import apply_patch_to_config
|
28
28
|
|
29
|
-
|
30
|
-
|
29
|
+
fetch_params = pipe.parameters.get('fetch', {})
|
30
|
+
if not fetch_params:
|
31
|
+
warn(f"Missing 'fetch' parameters for {pipe}.", stack=False)
|
31
32
|
return None
|
32
33
|
|
33
|
-
|
34
|
+
pipe_meta = fetch_params.get('pipe', {})
|
35
|
+
### Legacy: check for `connector_keys`, etc. at the root.
|
36
|
+
if not pipe_meta:
|
37
|
+
ck, mk, lk = (
|
38
|
+
fetch_params.get('connector_keys', None),
|
39
|
+
fetch_params.get('metric_key', None),
|
40
|
+
fetch_params.get('location_key', None),
|
41
|
+
)
|
42
|
+
if not ck or not mk:
|
43
|
+
warn(f"Missing `fetch:pipe` keys for {pipe}.", stack=False)
|
44
|
+
return None
|
34
45
|
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
warn(f"Missing metric_key in fetch parameters for Pipe '{pipe}'", stack=False)
|
41
|
-
return None
|
42
|
-
remote_metric_key = instructions.get('metric_key', None)
|
43
|
-
remote_location_key = instructions.get('location_key', None)
|
44
|
-
if begin is None:
|
45
|
-
begin = pipe.sync_time
|
46
|
+
pipe_meta.update({
|
47
|
+
'connector': ck,
|
48
|
+
'metric': mk,
|
49
|
+
'location': lk,
|
50
|
+
})
|
46
51
|
|
47
|
-
|
48
|
-
|
52
|
+
pipe_meta['instance'] = self
|
53
|
+
source_pipe = mrsm.Pipe(**pipe_meta)
|
49
54
|
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
remote_location_key,
|
55
|
-
mrsm_instance = self
|
56
|
-
)
|
57
|
-
begin = (
|
58
|
-
begin if not (isinstance(begin, str) and begin == '')
|
59
|
-
else pipe.get_sync_time(debug=debug)
|
60
|
-
)
|
55
|
+
_params = copy.deepcopy(params) if params is not None else {}
|
56
|
+
_params = apply_patch_to_config(_params, fetch_params.get('params', {}))
|
57
|
+
select_columns = fetch_params.get('select_columns', [])
|
58
|
+
omit_columns = fetch_params.get('omit_columns', [])
|
61
59
|
|
62
|
-
return
|
63
|
-
|
64
|
-
|
65
|
-
|
60
|
+
return source_pipe.get_data(
|
61
|
+
select_columns = select_columns,
|
62
|
+
omit_columns = omit_columns,
|
63
|
+
begin = begin,
|
64
|
+
end = end,
|
65
|
+
params = _params,
|
66
|
+
debug = debug,
|
67
|
+
as_iterator = True,
|
66
68
|
)
|
@@ -7,6 +7,9 @@ Register or fetch Pipes from the API
|
|
7
7
|
"""
|
8
8
|
|
9
9
|
from __future__ import annotations
|
10
|
+
import time
|
11
|
+
import json
|
12
|
+
from io import StringIO
|
10
13
|
from datetime import datetime
|
11
14
|
from meerschaum.utils.debug import dprint
|
12
15
|
from meerschaum.utils.warnings import warn, error
|
@@ -123,7 +126,6 @@ def fetch_pipes_keys(
|
|
123
126
|
A list of tuples containing pipes' keys.
|
124
127
|
"""
|
125
128
|
from meerschaum.config.static import STATIC_CONFIG
|
126
|
-
import json
|
127
129
|
if connector_keys is None:
|
128
130
|
connector_keys = []
|
129
131
|
if metric_keys is None:
|
@@ -162,14 +164,11 @@ def sync_pipe(
|
|
162
164
|
debug: bool = False,
|
163
165
|
**kw: Any
|
164
166
|
) -> SuccessTuple:
|
165
|
-
"""
|
166
|
-
If Pipe does not exist, it is registered with supplied metadata.
|
167
|
-
"""
|
167
|
+
"""Sync a DataFrame into a Pipe."""
|
168
168
|
from meerschaum.utils.debug import dprint
|
169
169
|
from meerschaum.utils.misc import json_serialize_datetime
|
170
170
|
from meerschaum.config import get_config
|
171
171
|
from meerschaum.utils.packages import attempt_import
|
172
|
-
import json, time
|
173
172
|
begin = time.time()
|
174
173
|
more_itertools = attempt_import('more_itertools')
|
175
174
|
if df is None:
|
@@ -185,14 +184,14 @@ def sync_pipe(
|
|
185
184
|
|
186
185
|
df = json.loads(df) if isinstance(df, str) else df
|
187
186
|
|
188
|
-
|
189
|
-
_chunksize : Optional[int] = (1 if chunksize is None else (
|
187
|
+
_chunksize: Optional[int] = (1 if chunksize is None else (
|
190
188
|
get_config('system', 'connectors', 'sql', 'chunksize') if chunksize == -1
|
191
189
|
else chunksize
|
192
190
|
))
|
193
|
-
keys
|
191
|
+
keys: list = list(df.keys())
|
194
192
|
chunks = []
|
195
193
|
if hasattr(df, 'index'):
|
194
|
+
df = df.reset_index(drop=True)
|
196
195
|
rowcount = len(df)
|
197
196
|
chunks = [df.iloc[i] for i in more_itertools.chunked(df.index, _chunksize)]
|
198
197
|
elif isinstance(df, dict):
|
@@ -310,7 +309,6 @@ def get_pipe_data(
|
|
310
309
|
**kw: Any
|
311
310
|
) -> Union[pandas.DataFrame, None]:
|
312
311
|
"""Fetch data from the API."""
|
313
|
-
import json
|
314
312
|
r_url = pipe_r_url(pipe)
|
315
313
|
chunks_list = []
|
316
314
|
while True:
|
@@ -340,7 +338,7 @@ def get_pipe_data(
|
|
340
338
|
from meerschaum.utils.dataframe import parse_df_datetimes
|
341
339
|
pd = import_pandas()
|
342
340
|
try:
|
343
|
-
df = pd.read_json(response.text)
|
341
|
+
df = pd.read_json(StringIO(response.text))
|
344
342
|
except Exception as e:
|
345
343
|
warn(f"Failed to parse response for {pipe}:\n{e}")
|
346
344
|
return None
|
@@ -367,7 +365,6 @@ def get_backtrack_data(
|
|
367
365
|
**kw: Any,
|
368
366
|
) -> pandas.DataFrame:
|
369
367
|
"""Get a Pipe's backtrack data from the API."""
|
370
|
-
import json
|
371
368
|
r_url = pipe_r_url(pipe)
|
372
369
|
try:
|
373
370
|
response = self.get(
|
@@ -389,12 +386,12 @@ def get_backtrack_data(
|
|
389
386
|
dprint(response.text)
|
390
387
|
pd = import_pandas()
|
391
388
|
try:
|
392
|
-
df = pd.read_json(response.text)
|
389
|
+
df = pd.read_json(StringIO(response.text))
|
393
390
|
except Exception as e:
|
394
391
|
warn(f"Failed to read response into a dataframe:\n{e}")
|
395
392
|
return None
|
396
393
|
|
397
|
-
df = parse_df_datetimes(pd.read_json(response.text), debug=debug)
|
394
|
+
df = parse_df_datetimes(pd.read_json(StringIO(response.text)), debug=debug)
|
398
395
|
return df
|
399
396
|
|
400
397
|
def get_pipe_id(
|
@@ -438,7 +435,6 @@ def get_pipe_attributes(
|
|
438
435
|
"""
|
439
436
|
r_url = pipe_r_url(pipe)
|
440
437
|
response = self.get(r_url + '/attributes', debug=debug)
|
441
|
-
import json
|
442
438
|
try:
|
443
439
|
return json.loads(response.text)
|
444
440
|
except Exception as e:
|
@@ -474,7 +470,6 @@ def get_sync_time(
|
|
474
470
|
"""
|
475
471
|
from meerschaum.utils.misc import is_int
|
476
472
|
from meerschaum.utils.warnings import warn
|
477
|
-
import datetime, json
|
478
473
|
r_url = pipe_r_url(pipe)
|
479
474
|
response = self.get(
|
480
475
|
r_url + '/sync_time',
|
@@ -545,7 +540,6 @@ def create_metadata(
|
|
545
540
|
"""
|
546
541
|
from meerschaum.utils.debug import dprint
|
547
542
|
from meerschaum.config.static import STATIC_CONFIG
|
548
|
-
import json
|
549
543
|
r_url = STATIC_CONFIG['api']['endpoints']['metadata']
|
550
544
|
response = self.post(r_url, debug=debug)
|
551
545
|
if debug:
|
@@ -590,7 +584,6 @@ def get_pipe_rowcount(
|
|
590
584
|
The number of rows in the pipe's table, bound the given parameters.
|
591
585
|
If the table does not exist, return 0.
|
592
586
|
"""
|
593
|
-
import json
|
594
587
|
r_url = pipe_r_url(pipe)
|
595
588
|
response = self.get(
|
596
589
|
r_url + "/rowcount",
|
@@ -148,7 +148,7 @@ def get_pipe_metadef(
|
|
148
148
|
dt_name = sql_item_name(_dt, self.flavor)
|
149
149
|
is_guess = False
|
150
150
|
|
151
|
-
if begin
|
151
|
+
if begin not in (None, '') or end is not None:
|
152
152
|
if is_guess:
|
153
153
|
if _dt is None:
|
154
154
|
warn(
|
@@ -168,20 +168,38 @@ def get_pipe_metadef(
|
|
168
168
|
if 'order by' in definition.lower() and 'over' not in definition.lower():
|
169
169
|
error("Cannot fetch with an ORDER clause in the definition")
|
170
170
|
|
171
|
+
apply_backtrack = begin == ''
|
171
172
|
begin = (
|
172
|
-
|
173
|
-
|
173
|
+
pipe.get_sync_time(debug=debug)
|
174
|
+
if begin == ''
|
175
|
+
else begin
|
174
176
|
)
|
175
|
-
|
177
|
+
|
178
|
+
if begin and end and begin >= end:
|
179
|
+
begin = None
|
180
|
+
|
176
181
|
da = None
|
177
182
|
if dt_name:
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
183
|
+
begin_da = (
|
184
|
+
dateadd_str(
|
185
|
+
flavor = self.flavor,
|
186
|
+
datepart = 'minute',
|
187
|
+
number = ((-1 * btm) if apply_backtrack else 0),
|
188
|
+
begin = begin,
|
189
|
+
)
|
190
|
+
if begin
|
191
|
+
else None
|
192
|
+
)
|
193
|
+
end_da = (
|
194
|
+
dateadd_str(
|
195
|
+
flavor = self.flavor,
|
196
|
+
datepart = 'minute',
|
197
|
+
number = 0,
|
198
|
+
begin = end,
|
199
|
+
)
|
200
|
+
if end
|
201
|
+
else None
|
202
|
+
)
|
185
203
|
|
186
204
|
meta_def = (
|
187
205
|
_simple_fetch_query(pipe) if (
|
@@ -1438,12 +1438,11 @@ def sync_pipe_inplace(
|
|
1438
1438
|
drop_backtrack_query = f"DROP TABLE {backtrack_table_name}"
|
1439
1439
|
if table_exists(backtrack_table_raw, self, debug=debug):
|
1440
1440
|
backtrack_queries.append(drop_backtrack_query)
|
1441
|
-
btm = max(self.get_pipe_backtrack_minutes(pipe), 1)
|
1442
1441
|
backtrack_def = self.get_pipe_data_query(
|
1443
1442
|
pipe,
|
1444
1443
|
begin = begin,
|
1445
1444
|
end = end,
|
1446
|
-
begin_add_minutes =
|
1445
|
+
begin_add_minutes = 0,
|
1447
1446
|
end_add_minutes = 1,
|
1448
1447
|
params = params,
|
1449
1448
|
debug = debug,
|
meerschaum/core/Pipe/__init__.py
CHANGED
@@ -81,7 +81,10 @@ class Pipe:
|
|
81
81
|
```
|
82
82
|
"""
|
83
83
|
|
84
|
-
from ._fetch import
|
84
|
+
from ._fetch import (
|
85
|
+
fetch,
|
86
|
+
get_backtrack_interval,
|
87
|
+
)
|
85
88
|
from ._data import (
|
86
89
|
get_data,
|
87
90
|
get_backtrack_data,
|
@@ -279,15 +282,26 @@ class Pipe:
|
|
279
282
|
|
280
283
|
@property
|
281
284
|
def meta(self):
|
282
|
-
"""
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
285
|
+
"""
|
286
|
+
Return the four keys needed to reconstruct this pipe.
|
287
|
+
"""
|
288
|
+
return {
|
289
|
+
'connector_keys': self.connector_keys,
|
290
|
+
'metric_key' : self.metric_key,
|
291
|
+
'location_key' : self.location_key,
|
292
|
+
'instance' : self.instance_keys,
|
293
|
+
}
|
294
|
+
|
295
|
+
|
296
|
+
def keys(self) -> List[str]:
|
297
|
+
"""
|
298
|
+
Return the ordered keys for this pipe.
|
299
|
+
"""
|
300
|
+
return {
|
301
|
+
key: val
|
302
|
+
for key, val in self.meta.items()
|
303
|
+
if key != 'instance'
|
304
|
+
}
|
291
305
|
|
292
306
|
|
293
307
|
@property
|
@@ -436,3 +450,10 @@ class Pipe:
|
|
436
450
|
metric_key = _state.pop('metric_key')
|
437
451
|
location_key = _state.pop('location_key')
|
438
452
|
self.__init__(connector_keys, metric_key, location_key, **_state)
|
453
|
+
|
454
|
+
|
455
|
+
def __getitem__(self, *args, **kwargs) -> Any:
|
456
|
+
"""
|
457
|
+
Index the pipe's attributes.
|
458
|
+
"""
|
459
|
+
return self.attributes.__getitem__(*args, **kwargs)
|
meerschaum/core/Pipe/_data.py
CHANGED
@@ -325,7 +325,7 @@ def _get_data_as_iterator(
|
|
325
325
|
def get_backtrack_data(
|
326
326
|
self,
|
327
327
|
backtrack_minutes: int = 0,
|
328
|
-
begin:
|
328
|
+
begin: Union[datetime, int, None] = None,
|
329
329
|
params: Optional[Dict[str, Any]] = None,
|
330
330
|
fresh: bool = False,
|
331
331
|
debug: bool = False,
|
@@ -338,7 +338,7 @@ def get_backtrack_data(
|
|
338
338
|
----------
|
339
339
|
backtrack_minutes: int, default 0
|
340
340
|
How many minutes from `begin` to select from.
|
341
|
-
|
341
|
+
If 0 (default), use `pipe.parameters['fetch']['backtrack_minutes']`.
|
342
342
|
|
343
343
|
begin: Optional[datetime], default None
|
344
344
|
The starting point to search for data.
|
@@ -370,7 +370,6 @@ def get_backtrack_data(
|
|
370
370
|
-------
|
371
371
|
A `pd.DataFrame` for the pipe's data corresponding to the provided parameters. Backtrack data
|
372
372
|
is a convenient way to get a pipe's data "backtracked" from the most recent datetime.
|
373
|
-
|
374
373
|
"""
|
375
374
|
from meerschaum.utils.warnings import warn
|
376
375
|
from meerschaum.utils.venv import Venv
|
@@ -379,6 +378,14 @@ def get_backtrack_data(
|
|
379
378
|
if not self.exists(debug=debug):
|
380
379
|
return None
|
381
380
|
|
381
|
+
backtrack_interval = self.get_backtrack_interval(debug=debug)
|
382
|
+
if backtrack_minutes == 0:
|
383
|
+
backtrack_minutes = (
|
384
|
+
(backtrack_interval.total_seconds() * 60)
|
385
|
+
if isinstance(backtrack_interval, timedelta)
|
386
|
+
else backtrack_interval
|
387
|
+
)
|
388
|
+
|
382
389
|
if self.cache_pipe is not None:
|
383
390
|
if not fresh:
|
384
391
|
_sync_cache_tuple = self.cache_pipe.sync(begin=begin, params=params, debug=debug, **kw)
|
@@ -438,7 +445,7 @@ def get_rowcount(
|
|
438
445
|
params: Optional[Dict[str, Any]] = None,
|
439
446
|
remote: bool = False,
|
440
447
|
debug: bool = False
|
441
|
-
) ->
|
448
|
+
) -> int:
|
442
449
|
"""
|
443
450
|
Get a Pipe's instance or remote rowcount.
|
444
451
|
|
@@ -460,8 +467,7 @@ def get_rowcount(
|
|
460
467
|
Returns
|
461
468
|
-------
|
462
469
|
An `int` of the number of rows in the pipe corresponding to the provided parameters.
|
463
|
-
|
464
|
-
|
470
|
+
Returned 0 if the pipe does not exist.
|
465
471
|
"""
|
466
472
|
from meerschaum.utils.warnings import warn
|
467
473
|
from meerschaum.utils.venv import Venv
|
@@ -470,7 +476,7 @@ def get_rowcount(
|
|
470
476
|
connector = self.instance_connector if not remote else self.connector
|
471
477
|
try:
|
472
478
|
with Venv(get_connector_plugin(connector)):
|
473
|
-
|
479
|
+
rowcount = connector.get_pipe_rowcount(
|
474
480
|
self,
|
475
481
|
begin = begin,
|
476
482
|
end = end,
|
@@ -478,12 +484,15 @@ def get_rowcount(
|
|
478
484
|
remote = remote,
|
479
485
|
debug = debug,
|
480
486
|
)
|
487
|
+
if rowcount is None:
|
488
|
+
return 0
|
489
|
+
return rowcount
|
481
490
|
except AttributeError as e:
|
482
491
|
warn(e)
|
483
492
|
if remote:
|
484
|
-
return
|
493
|
+
return 0
|
485
494
|
warn(f"Failed to get a rowcount for {self}.")
|
486
|
-
return
|
495
|
+
return 0
|
487
496
|
|
488
497
|
|
489
498
|
def get_chunk_interval(
|
@@ -505,8 +514,8 @@ def get_chunk_interval(
|
|
505
514
|
-------
|
506
515
|
The chunk interval (`timedelta` or `int`) to use with this pipe's `datetime` axis.
|
507
516
|
"""
|
508
|
-
default_chunk_minutes = get_config('pipes', 'parameters', 'chunk_minutes')
|
509
|
-
configured_chunk_minutes = self.parameters.get('chunk_minutes', None)
|
517
|
+
default_chunk_minutes = get_config('pipes', 'parameters', 'verify', 'chunk_minutes')
|
518
|
+
configured_chunk_minutes = self.parameters.get('verify', {}).get('chunk_minutes', None)
|
510
519
|
chunk_minutes = (
|
511
520
|
(configured_chunk_minutes or default_chunk_minutes)
|
512
521
|
if chunk_interval is None
|
@@ -559,7 +568,8 @@ def get_chunk_bounds(
|
|
559
568
|
|
560
569
|
chunk_interval: Union[timedelta, int, None], default None
|
561
570
|
If provided, use this interval for the size of chunk boundaries.
|
562
|
-
The default value for this pipe may be set
|
571
|
+
The default value for this pipe may be set
|
572
|
+
under `pipe.parameters['verify']['chunk_minutes']`.
|
563
573
|
|
564
574
|
debug: bool, default False
|
565
575
|
Verbosity toggle.
|
@@ -578,7 +588,7 @@ def get_chunk_bounds(
|
|
578
588
|
if begin is None and end is None:
|
579
589
|
return [(None, None)]
|
580
590
|
|
581
|
-
### Set the chunk interval under `pipe.parameters['chunk_minutes']`.
|
591
|
+
### Set the chunk interval under `pipe.parameters['verify']['chunk_minutes']`.
|
582
592
|
chunk_interval = self.get_chunk_interval(chunk_interval, debug=debug)
|
583
593
|
|
584
594
|
### Build a list of tuples containing the chunk boundaries
|
@@ -65,6 +65,7 @@ def deduplicate(
|
|
65
65
|
A `SuccessTuple` corresponding to whether all of the chunks were successfully deduplicated.
|
66
66
|
"""
|
67
67
|
from meerschaum.utils.warnings import warn, info
|
68
|
+
from meerschaum.utils.misc import interval_str, items_str
|
68
69
|
from meerschaum.utils.venv import Venv
|
69
70
|
from meerschaum.connectors import get_connector_plugin
|
70
71
|
from meerschaum.utils.pool import get_pool
|
@@ -74,6 +75,7 @@ def deduplicate(
|
|
74
75
|
begin = begin,
|
75
76
|
end = end,
|
76
77
|
params = params,
|
78
|
+
bounded = bounded,
|
77
79
|
debug = debug,
|
78
80
|
**kwargs
|
79
81
|
)
|
@@ -90,6 +92,7 @@ def deduplicate(
|
|
90
92
|
begin = begin,
|
91
93
|
end = end,
|
92
94
|
params = params,
|
95
|
+
bounded = bounded,
|
93
96
|
debug = debug,
|
94
97
|
**kwargs
|
95
98
|
)
|
@@ -104,8 +107,18 @@ def deduplicate(
|
|
104
107
|
begin = (
|
105
108
|
bound_time
|
106
109
|
if bound_time is not None
|
107
|
-
else self.get_sync_time(debug=debug)
|
110
|
+
else self.get_sync_time(newest=False, debug=debug)
|
108
111
|
)
|
112
|
+
if bounded and end is None:
|
113
|
+
end = self.get_sync_time(newest=True, debug=debug)
|
114
|
+
|
115
|
+
if bounded and end is not None:
|
116
|
+
end += (
|
117
|
+
timedelta(minutes=1)
|
118
|
+
if isinstance(end, datetime)
|
119
|
+
else 1
|
120
|
+
)
|
121
|
+
|
109
122
|
chunk_bounds = self.get_chunk_bounds(
|
110
123
|
bounded = bounded,
|
111
124
|
begin = begin,
|
@@ -115,6 +128,8 @@ def deduplicate(
|
|
115
128
|
)
|
116
129
|
|
117
130
|
indices = [col for col in self.columns.values() if col]
|
131
|
+
if not indices:
|
132
|
+
return False, f"Cannot deduplicate without index columns."
|
118
133
|
dt_col = self.columns.get('datetime', None)
|
119
134
|
|
120
135
|
def process_chunk_bounds(bounds) -> Tuple[
|
@@ -155,7 +170,20 @@ def deduplicate(
|
|
155
170
|
return bounds, (True, f"{chunk_msg_header}\nChunk is empty, skipping...")
|
156
171
|
|
157
172
|
chunk_indices = [ix for ix in indices if ix in full_chunk.columns]
|
158
|
-
|
173
|
+
if not chunk_indices:
|
174
|
+
return bounds, (False, f"None of {items_str(indices)} were present in chunk.")
|
175
|
+
try:
|
176
|
+
full_chunk = full_chunk.drop_duplicates(
|
177
|
+
subset = chunk_indices,
|
178
|
+
keep = 'last'
|
179
|
+
).reset_index(
|
180
|
+
drop = True,
|
181
|
+
)
|
182
|
+
except Exception as e:
|
183
|
+
return (
|
184
|
+
bounds,
|
185
|
+
(False, f"Failed to deduplicate chunk on {items_str(chunk_indices)}:\n({e})")
|
186
|
+
)
|
159
187
|
|
160
188
|
clear_success, clear_msg = self.clear(
|
161
189
|
begin = chunk_begin,
|
@@ -192,19 +220,16 @@ def deduplicate(
|
|
192
220
|
True, (
|
193
221
|
chunk_msg_header + "\n"
|
194
222
|
+ chunk_msg_body + ("\n" if chunk_msg_body else '')
|
195
|
-
+ f"
|
223
|
+
+ f"Deduplicated chunk from {existing_chunk_len} to {chunk_rowcount} rows."
|
196
224
|
)
|
197
225
|
)
|
198
226
|
|
199
|
-
_start = chunk_bounds[0][(0 if bounded else 1)]
|
200
|
-
_end = chunk_bounds[-1][(0 if not bounded else 1)]
|
201
|
-
message_header = f"{_start} - {_end}"
|
202
227
|
info(
|
203
228
|
f"Deduplicating {len(chunk_bounds)} chunk"
|
204
229
|
+ ('s' if len(chunk_bounds) != 1 else '')
|
205
230
|
+ f" ({'un' if not bounded else ''}bounded)"
|
206
|
-
+ f" of size '{chunk_interval}'"
|
207
|
-
+ f"
|
231
|
+
+ f" of size '{interval_str(chunk_interval)}'"
|
232
|
+
+ f" on {self}."
|
208
233
|
)
|
209
234
|
bounds_success_tuples = dict(pool.map(process_chunk_bounds, chunk_bounds))
|
210
235
|
bounds_successes = {
|
@@ -223,11 +248,10 @@ def deduplicate(
|
|
223
248
|
return (
|
224
249
|
False,
|
225
250
|
(
|
226
|
-
|
227
|
-
+ f"Failed to deduplicate {len(bounds_failures)} chunk"
|
251
|
+
f"Failed to deduplicate {len(bounds_failures)} chunk"
|
228
252
|
+ ('s' if len(bounds_failures) != 1 else '')
|
229
|
-
+ "
|
230
|
-
+ "\n".join([msg for _, (_, msg) in bounds_failures.items()])
|
253
|
+
+ ".\n"
|
254
|
+
+ "\n".join([msg for _, (_, msg) in bounds_failures.items() if msg])
|
231
255
|
)
|
232
256
|
)
|
233
257
|
|
@@ -236,11 +260,10 @@ def deduplicate(
|
|
236
260
|
return (
|
237
261
|
True,
|
238
262
|
(
|
239
|
-
|
240
|
-
+ f"Successfully deduplicated {len(bounds_successes)} chunk"
|
263
|
+
f"Successfully deduplicated {len(bounds_successes)} chunk"
|
241
264
|
+ ('s' if len(bounds_successes) != 1 else '')
|
242
265
|
+ ".\n"
|
243
|
-
+ "\n".join([msg for _, (_, msg) in bounds_successes.items()])
|
266
|
+
+ "\n".join([msg for _, (_, msg) in bounds_successes.items() if msg])
|
244
267
|
).rstrip('\n')
|
245
268
|
)
|
246
269
|
|
@@ -262,21 +285,19 @@ def deduplicate(
|
|
262
285
|
return (
|
263
286
|
True,
|
264
287
|
(
|
265
|
-
|
266
|
-
+ f"Successfully deduplicated {len(bounds_successes)} chunk"
|
288
|
+
f"Successfully deduplicated {len(bounds_successes)} chunk"
|
267
289
|
+ ('s' if len(bounds_successes) != 1 else '')
|
268
|
-
+ f"
|
269
|
-
+ "\n".join([msg for _, (_, msg) in bounds_successes.items()])
|
290
|
+
+ f"({len(retry_bounds_successes)} retried):\n"
|
291
|
+
+ "\n".join([msg for _, (_, msg) in bounds_successes.items() if msg])
|
270
292
|
).rstrip('\n')
|
271
293
|
)
|
272
294
|
|
273
295
|
return (
|
274
296
|
False,
|
275
297
|
(
|
276
|
-
|
277
|
-
+ f"Failed to deduplicate {len(bounds_failures)} chunk"
|
298
|
+
f"Failed to deduplicate {len(bounds_failures)} chunk"
|
278
299
|
+ ('s' if len(retry_bounds_failures) != 1 else '')
|
279
|
-
+ "
|
280
|
-
+ "\n".join([msg for _, (_, msg) in retry_bounds_failures.items()])
|
300
|
+
+ ".\n"
|
301
|
+
+ "\n".join([msg for _, (_, msg) in retry_bounds_failures.items() if msg])
|
281
302
|
).rstrip('\n')
|
282
303
|
)
|
meerschaum/core/Pipe/_dtypes.py
CHANGED
@@ -7,6 +7,7 @@ Enforce data types for a pipe's underlying table.
|
|
7
7
|
"""
|
8
8
|
|
9
9
|
from __future__ import annotations
|
10
|
+
from io import StringIO
|
10
11
|
from meerschaum.utils.typing import Dict, Any, Optional
|
11
12
|
|
12
13
|
def enforce_dtypes(
|
@@ -38,7 +39,7 @@ def enforce_dtypes(
|
|
38
39
|
try:
|
39
40
|
if isinstance(df, str):
|
40
41
|
df = parse_df_datetimes(
|
41
|
-
pd.read_json(df),
|
42
|
+
pd.read_json(StringIO(df)),
|
42
43
|
ignore_cols = [
|
43
44
|
col
|
44
45
|
for col, dtype in pipe_dtypes.items()
|
meerschaum/core/Pipe/_fetch.py
CHANGED
@@ -7,7 +7,9 @@ Functions for fetching new data into the Pipe
|
|
7
7
|
"""
|
8
8
|
|
9
9
|
from __future__ import annotations
|
10
|
+
from datetime import timedelta
|
10
11
|
from meerschaum.utils.typing import Optional, Any
|
12
|
+
from meerschaum.config import get_config
|
11
13
|
|
12
14
|
def fetch(
|
13
15
|
self,
|
@@ -98,3 +100,30 @@ def fetch(
|
|
98
100
|
connector_plugin.deactivate_venv(debug=debug)
|
99
101
|
|
100
102
|
return df
|
103
|
+
|
104
|
+
|
105
|
+
def get_backtrack_interval(self, debug: bool = False) -> Union[timedelta, int]:
|
106
|
+
"""
|
107
|
+
Get the chunk interval to use for this pipe.
|
108
|
+
|
109
|
+
Returns
|
110
|
+
-------
|
111
|
+
The backtrack interval (`timedelta` or `int`) to use with this pipe's `datetime` axis.
|
112
|
+
"""
|
113
|
+
default_backtrack_minutes = get_config('pipes', 'parameters', 'fetch', 'backtrack_minutes')
|
114
|
+
configured_backtrack_minutes = self.parameters.get('fetch', {}).get('backtrack_minutes', None)
|
115
|
+
backtrack_minutes = (
|
116
|
+
configured_backtrack_minutes
|
117
|
+
if configured_backtrack_minutes is not None
|
118
|
+
else default_backtrack_minutes
|
119
|
+
)
|
120
|
+
|
121
|
+
dt_col = self.columns.get('datetime', None)
|
122
|
+
if dt_col is None:
|
123
|
+
return timedelta(minutes=backtrack_minutes)
|
124
|
+
|
125
|
+
dt_dtype = self.dtypes.get(dt_col, 'datetime64[ns]')
|
126
|
+
if 'datetime' in dt_dtype.lower():
|
127
|
+
return timedelta(minutes=backtrack_minutes)
|
128
|
+
|
129
|
+
return backtrack_minutes
|