meerschaum 2.0.0rc7__py3-none-any.whl → 2.0.0rc9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. meerschaum/actions/__init__.py +97 -48
  2. meerschaum/actions/bootstrap.py +1 -1
  3. meerschaum/actions/clear.py +1 -1
  4. meerschaum/actions/deduplicate.py +1 -1
  5. meerschaum/actions/delete.py +8 -7
  6. meerschaum/actions/drop.py +1 -10
  7. meerschaum/actions/edit.py +1 -1
  8. meerschaum/actions/install.py +1 -1
  9. meerschaum/actions/pause.py +1 -1
  10. meerschaum/actions/register.py +1 -1
  11. meerschaum/actions/setup.py +1 -1
  12. meerschaum/actions/show.py +1 -1
  13. meerschaum/actions/start.py +18 -7
  14. meerschaum/actions/stop.py +5 -4
  15. meerschaum/actions/sync.py +3 -1
  16. meerschaum/actions/uninstall.py +1 -1
  17. meerschaum/actions/upgrade.py +1 -1
  18. meerschaum/actions/verify.py +54 -3
  19. meerschaum/config/_default.py +1 -1
  20. meerschaum/config/_formatting.py +26 -0
  21. meerschaum/config/_jobs.py +28 -5
  22. meerschaum/config/_paths.py +21 -5
  23. meerschaum/config/_version.py +1 -1
  24. meerschaum/connectors/api/_fetch.py +40 -38
  25. meerschaum/connectors/api/_pipes.py +10 -17
  26. meerschaum/connectors/sql/_fetch.py +29 -11
  27. meerschaum/connectors/sql/_pipes.py +1 -2
  28. meerschaum/core/Pipe/__init__.py +31 -10
  29. meerschaum/core/Pipe/_data.py +23 -13
  30. meerschaum/core/Pipe/_deduplicate.py +44 -23
  31. meerschaum/core/Pipe/_dtypes.py +2 -1
  32. meerschaum/core/Pipe/_fetch.py +29 -0
  33. meerschaum/core/Pipe/_sync.py +25 -18
  34. meerschaum/core/Pipe/_verify.py +60 -25
  35. meerschaum/plugins/__init__.py +3 -0
  36. meerschaum/utils/daemon/Daemon.py +108 -27
  37. meerschaum/utils/daemon/__init__.py +35 -1
  38. meerschaum/utils/dataframe.py +2 -0
  39. meerschaum/utils/formatting/__init__.py +144 -1
  40. meerschaum/utils/formatting/_pipes.py +28 -5
  41. meerschaum/utils/misc.py +184 -188
  42. meerschaum/utils/packages/__init__.py +1 -1
  43. meerschaum/utils/packages/_packages.py +1 -0
  44. {meerschaum-2.0.0rc7.dist-info → meerschaum-2.0.0rc9.dist-info}/METADATA +4 -1
  45. {meerschaum-2.0.0rc7.dist-info → meerschaum-2.0.0rc9.dist-info}/RECORD +51 -51
  46. {meerschaum-2.0.0rc7.dist-info → meerschaum-2.0.0rc9.dist-info}/LICENSE +0 -0
  47. {meerschaum-2.0.0rc7.dist-info → meerschaum-2.0.0rc9.dist-info}/NOTICE +0 -0
  48. {meerschaum-2.0.0rc7.dist-info → meerschaum-2.0.0rc9.dist-info}/WHEEL +0 -0
  49. {meerschaum-2.0.0rc7.dist-info → meerschaum-2.0.0rc9.dist-info}/entry_points.txt +0 -0
  50. {meerschaum-2.0.0rc7.dist-info → meerschaum-2.0.0rc9.dist-info}/top_level.txt +0 -0
  51. {meerschaum-2.0.0rc7.dist-info → meerschaum-2.0.0rc9.dist-info}/zip-safe +0 -0
@@ -7,60 +7,62 @@ Fetch Pipe data via the API connector
7
7
  """
8
8
 
9
9
  from __future__ import annotations
10
- import datetime
10
+ from datetime import datetime
11
11
  import copy
12
- from meerschaum.utils.typing import Any, Optional, Dict
12
+ import meerschaum as mrsm
13
+ from meerschaum.utils.typing import Any, Optional, Dict, Iterator, Union
13
14
 
14
15
  def fetch(
15
16
  self,
16
- pipe: meerschaum.Pipe,
17
- begin: Optional[datetime.datetime, str] = '',
18
- end: Optional[datetime.datetime] = None,
17
+ pipe: mrsm.Pipe,
18
+ begin: Union[datetime, str, int] = '',
19
+ end: Union[datetime, int] = None,
19
20
  params: Optional[Dict, Any] = None,
20
21
  debug: bool = False,
21
22
  **kw: Any
22
- ) -> pandas.DataFrame:
23
+ ) -> Iterator['pd.DataFrame']:
23
24
  """Get the Pipe data from the remote Pipe."""
24
25
  from meerschaum.utils.debug import dprint
25
26
  from meerschaum.utils.warnings import warn, error
26
- from meerschaum.config.static import _static_config
27
27
  from meerschaum.config._patch import apply_patch_to_config
28
28
 
29
- if 'fetch' not in pipe.parameters:
30
- warn(f"Missing 'fetch' parameters for Pipe '{pipe}'.", stack=False)
29
+ fetch_params = pipe.parameters.get('fetch', {})
30
+ if not fetch_params:
31
+ warn(f"Missing 'fetch' parameters for {pipe}.", stack=False)
31
32
  return None
32
33
 
33
- instructions = pipe.parameters['fetch']
34
+ pipe_meta = fetch_params.get('pipe', {})
35
+ ### Legacy: check for `connector_keys`, etc. at the root.
36
+ if not pipe_meta:
37
+ ck, mk, lk = (
38
+ fetch_params.get('connector_keys', None),
39
+ fetch_params.get('metric_key', None),
40
+ fetch_params.get('location_key', None),
41
+ )
42
+ if not ck or not mk:
43
+ warn(f"Missing `fetch:pipe` keys for {pipe}.", stack=False)
44
+ return None
34
45
 
35
- if 'connector_keys' not in instructions:
36
- warn(f"Missing connector_keys in fetch parameters for Pipe '{pipe}'", stack=False)
37
- return None
38
- remote_connector_keys = instructions.get('connector_keys', None)
39
- if 'metric_key' not in instructions:
40
- warn(f"Missing metric_key in fetch parameters for Pipe '{pipe}'", stack=False)
41
- return None
42
- remote_metric_key = instructions.get('metric_key', None)
43
- remote_location_key = instructions.get('location_key', None)
44
- if begin is None:
45
- begin = pipe.sync_time
46
+ pipe_meta.update({
47
+ 'connector': ck,
48
+ 'metric': mk,
49
+ 'location': lk,
50
+ })
46
51
 
47
- _params = copy.deepcopy(params) if params is not None else {}
48
- _params = apply_patch_to_config(_params, instructions.get('params', {}))
52
+ pipe_meta['instance'] = self
53
+ source_pipe = mrsm.Pipe(**pipe_meta)
49
54
 
50
- from meerschaum import Pipe
51
- p = Pipe(
52
- remote_connector_keys,
53
- remote_metric_key,
54
- remote_location_key,
55
- mrsm_instance = self
56
- )
57
- begin = (
58
- begin if not (isinstance(begin, str) and begin == '')
59
- else pipe.get_sync_time(debug=debug)
60
- )
55
+ _params = copy.deepcopy(params) if params is not None else {}
56
+ _params = apply_patch_to_config(_params, fetch_params.get('params', {}))
57
+ select_columns = fetch_params.get('select_columns', [])
58
+ omit_columns = fetch_params.get('omit_columns', [])
61
59
 
62
- return p.get_data(
63
- begin=begin, end=end,
64
- params=_params,
65
- debug=debug
60
+ return source_pipe.get_data(
61
+ select_columns = select_columns,
62
+ omit_columns = omit_columns,
63
+ begin = begin,
64
+ end = end,
65
+ params = _params,
66
+ debug = debug,
67
+ as_iterator = True,
66
68
  )
@@ -7,6 +7,9 @@ Register or fetch Pipes from the API
7
7
  """
8
8
 
9
9
  from __future__ import annotations
10
+ import time
11
+ import json
12
+ from io import StringIO
10
13
  from datetime import datetime
11
14
  from meerschaum.utils.debug import dprint
12
15
  from meerschaum.utils.warnings import warn, error
@@ -123,7 +126,6 @@ def fetch_pipes_keys(
123
126
  A list of tuples containing pipes' keys.
124
127
  """
125
128
  from meerschaum.config.static import STATIC_CONFIG
126
- import json
127
129
  if connector_keys is None:
128
130
  connector_keys = []
129
131
  if metric_keys is None:
@@ -162,14 +164,11 @@ def sync_pipe(
162
164
  debug: bool = False,
163
165
  **kw: Any
164
166
  ) -> SuccessTuple:
165
- """Append a pandas DataFrame to a Pipe.
166
- If Pipe does not exist, it is registered with supplied metadata.
167
- """
167
+ """Sync a DataFrame into a Pipe."""
168
168
  from meerschaum.utils.debug import dprint
169
169
  from meerschaum.utils.misc import json_serialize_datetime
170
170
  from meerschaum.config import get_config
171
171
  from meerschaum.utils.packages import attempt_import
172
- import json, time
173
172
  begin = time.time()
174
173
  more_itertools = attempt_import('more_itertools')
175
174
  if df is None:
@@ -185,14 +184,14 @@ def sync_pipe(
185
184
 
186
185
  df = json.loads(df) if isinstance(df, str) else df
187
186
 
188
- ### TODO Make separate chunksize for API?
189
- _chunksize : Optional[int] = (1 if chunksize is None else (
187
+ _chunksize: Optional[int] = (1 if chunksize is None else (
190
188
  get_config('system', 'connectors', 'sql', 'chunksize') if chunksize == -1
191
189
  else chunksize
192
190
  ))
193
- keys : list = list(df.keys())
191
+ keys: list = list(df.keys())
194
192
  chunks = []
195
193
  if hasattr(df, 'index'):
194
+ df = df.reset_index(drop=True)
196
195
  rowcount = len(df)
197
196
  chunks = [df.iloc[i] for i in more_itertools.chunked(df.index, _chunksize)]
198
197
  elif isinstance(df, dict):
@@ -310,7 +309,6 @@ def get_pipe_data(
310
309
  **kw: Any
311
310
  ) -> Union[pandas.DataFrame, None]:
312
311
  """Fetch data from the API."""
313
- import json
314
312
  r_url = pipe_r_url(pipe)
315
313
  chunks_list = []
316
314
  while True:
@@ -340,7 +338,7 @@ def get_pipe_data(
340
338
  from meerschaum.utils.dataframe import parse_df_datetimes
341
339
  pd = import_pandas()
342
340
  try:
343
- df = pd.read_json(response.text)
341
+ df = pd.read_json(StringIO(response.text))
344
342
  except Exception as e:
345
343
  warn(f"Failed to parse response for {pipe}:\n{e}")
346
344
  return None
@@ -367,7 +365,6 @@ def get_backtrack_data(
367
365
  **kw: Any,
368
366
  ) -> pandas.DataFrame:
369
367
  """Get a Pipe's backtrack data from the API."""
370
- import json
371
368
  r_url = pipe_r_url(pipe)
372
369
  try:
373
370
  response = self.get(
@@ -389,12 +386,12 @@ def get_backtrack_data(
389
386
  dprint(response.text)
390
387
  pd = import_pandas()
391
388
  try:
392
- df = pd.read_json(response.text)
389
+ df = pd.read_json(StringIO(response.text))
393
390
  except Exception as e:
394
391
  warn(f"Failed to read response into a dataframe:\n{e}")
395
392
  return None
396
393
 
397
- df = parse_df_datetimes(pd.read_json(response.text), debug=debug)
394
+ df = parse_df_datetimes(pd.read_json(StringIO(response.text)), debug=debug)
398
395
  return df
399
396
 
400
397
  def get_pipe_id(
@@ -438,7 +435,6 @@ def get_pipe_attributes(
438
435
  """
439
436
  r_url = pipe_r_url(pipe)
440
437
  response = self.get(r_url + '/attributes', debug=debug)
441
- import json
442
438
  try:
443
439
  return json.loads(response.text)
444
440
  except Exception as e:
@@ -474,7 +470,6 @@ def get_sync_time(
474
470
  """
475
471
  from meerschaum.utils.misc import is_int
476
472
  from meerschaum.utils.warnings import warn
477
- import datetime, json
478
473
  r_url = pipe_r_url(pipe)
479
474
  response = self.get(
480
475
  r_url + '/sync_time',
@@ -545,7 +540,6 @@ def create_metadata(
545
540
  """
546
541
  from meerschaum.utils.debug import dprint
547
542
  from meerschaum.config.static import STATIC_CONFIG
548
- import json
549
543
  r_url = STATIC_CONFIG['api']['endpoints']['metadata']
550
544
  response = self.post(r_url, debug=debug)
551
545
  if debug:
@@ -590,7 +584,6 @@ def get_pipe_rowcount(
590
584
  The number of rows in the pipe's table, bound the given parameters.
591
585
  If the table does not exist, return 0.
592
586
  """
593
- import json
594
587
  r_url = pipe_r_url(pipe)
595
588
  response = self.get(
596
589
  r_url + "/rowcount",
@@ -148,7 +148,7 @@ def get_pipe_metadef(
148
148
  dt_name = sql_item_name(_dt, self.flavor)
149
149
  is_guess = False
150
150
 
151
- if begin is not None or end is not None:
151
+ if begin not in (None, '') or end is not None:
152
152
  if is_guess:
153
153
  if _dt is None:
154
154
  warn(
@@ -168,20 +168,38 @@ def get_pipe_metadef(
168
168
  if 'order by' in definition.lower() and 'over' not in definition.lower():
169
169
  error("Cannot fetch with an ORDER clause in the definition")
170
170
 
171
+ apply_backtrack = begin == ''
171
172
  begin = (
172
- begin if not (isinstance(begin, str) and begin == '')
173
- else pipe.get_sync_time(debug=debug)
173
+ pipe.get_sync_time(debug=debug)
174
+ if begin == ''
175
+ else begin
174
176
  )
175
-
177
+
178
+ if begin and end and begin >= end:
179
+ begin = None
180
+
176
181
  da = None
177
182
  if dt_name:
178
- ### default: do not backtrack
179
- begin_da = dateadd_str(
180
- flavor=self.flavor, datepart='minute', number=(-1 * btm), begin=begin,
181
- ) if begin else None
182
- end_da = dateadd_str(
183
- flavor=self.flavor, datepart='minute', number=1, begin=end,
184
- ) if end else None
183
+ begin_da = (
184
+ dateadd_str(
185
+ flavor = self.flavor,
186
+ datepart = 'minute',
187
+ number = ((-1 * btm) if apply_backtrack else 0),
188
+ begin = begin,
189
+ )
190
+ if begin
191
+ else None
192
+ )
193
+ end_da = (
194
+ dateadd_str(
195
+ flavor = self.flavor,
196
+ datepart = 'minute',
197
+ number = 0,
198
+ begin = end,
199
+ )
200
+ if end
201
+ else None
202
+ )
185
203
 
186
204
  meta_def = (
187
205
  _simple_fetch_query(pipe) if (
@@ -1438,12 +1438,11 @@ def sync_pipe_inplace(
1438
1438
  drop_backtrack_query = f"DROP TABLE {backtrack_table_name}"
1439
1439
  if table_exists(backtrack_table_raw, self, debug=debug):
1440
1440
  backtrack_queries.append(drop_backtrack_query)
1441
- btm = max(self.get_pipe_backtrack_minutes(pipe), 1)
1442
1441
  backtrack_def = self.get_pipe_data_query(
1443
1442
  pipe,
1444
1443
  begin = begin,
1445
1444
  end = end,
1446
- begin_add_minutes = (-1 * btm),
1445
+ begin_add_minutes = 0,
1447
1446
  end_add_minutes = 1,
1448
1447
  params = params,
1449
1448
  debug = debug,
@@ -81,7 +81,10 @@ class Pipe:
81
81
  ```
82
82
  """
83
83
 
84
- from ._fetch import fetch
84
+ from ._fetch import (
85
+ fetch,
86
+ get_backtrack_interval,
87
+ )
85
88
  from ._data import (
86
89
  get_data,
87
90
  get_backtrack_data,
@@ -279,15 +282,26 @@ class Pipe:
279
282
 
280
283
  @property
281
284
  def meta(self):
282
- """Simulate the MetaPipe model without importing FastAPI."""
283
- if '_meta' not in self.__dict__:
284
- self._meta = {
285
- 'connector_keys' : self.connector_keys,
286
- 'metric_key' : self.metric_key,
287
- 'location_key' : self.location_key,
288
- 'instance' : self.instance_keys,
289
- }
290
- return self._meta
285
+ """
286
+ Return the four keys needed to reconstruct this pipe.
287
+ """
288
+ return {
289
+ 'connector_keys': self.connector_keys,
290
+ 'metric_key' : self.metric_key,
291
+ 'location_key' : self.location_key,
292
+ 'instance' : self.instance_keys,
293
+ }
294
+
295
+
296
+ def keys(self) -> List[str]:
297
+ """
298
+ Return the ordered keys for this pipe.
299
+ """
300
+ return {
301
+ key: val
302
+ for key, val in self.meta.items()
303
+ if key != 'instance'
304
+ }
291
305
 
292
306
 
293
307
  @property
@@ -436,3 +450,10 @@ class Pipe:
436
450
  metric_key = _state.pop('metric_key')
437
451
  location_key = _state.pop('location_key')
438
452
  self.__init__(connector_keys, metric_key, location_key, **_state)
453
+
454
+
455
+ def __getitem__(self, *args, **kwargs) -> Any:
456
+ """
457
+ Index the pipe's attributes.
458
+ """
459
+ return self.attributes.__getitem__(*args, **kwargs)
@@ -325,7 +325,7 @@ def _get_data_as_iterator(
325
325
  def get_backtrack_data(
326
326
  self,
327
327
  backtrack_minutes: int = 0,
328
- begin: Optional[datetime] = None,
328
+ begin: Union[datetime, int, None] = None,
329
329
  params: Optional[Dict[str, Any]] = None,
330
330
  fresh: bool = False,
331
331
  debug: bool = False,
@@ -338,7 +338,7 @@ def get_backtrack_data(
338
338
  ----------
339
339
  backtrack_minutes: int, default 0
340
340
  How many minutes from `begin` to select from.
341
- Defaults to 0. This may return a few rows due to a rounding quirk.
341
+ If 0 (default), use `pipe.parameters['fetch']['backtrack_minutes']`.
342
342
 
343
343
  begin: Optional[datetime], default None
344
344
  The starting point to search for data.
@@ -370,7 +370,6 @@ def get_backtrack_data(
370
370
  -------
371
371
  A `pd.DataFrame` for the pipe's data corresponding to the provided parameters. Backtrack data
372
372
  is a convenient way to get a pipe's data "backtracked" from the most recent datetime.
373
-
374
373
  """
375
374
  from meerschaum.utils.warnings import warn
376
375
  from meerschaum.utils.venv import Venv
@@ -379,6 +378,14 @@ def get_backtrack_data(
379
378
  if not self.exists(debug=debug):
380
379
  return None
381
380
 
381
+ backtrack_interval = self.get_backtrack_interval(debug=debug)
382
+ if backtrack_minutes == 0:
383
+ backtrack_minutes = (
384
+ (backtrack_interval.total_seconds() * 60)
385
+ if isinstance(backtrack_interval, timedelta)
386
+ else backtrack_interval
387
+ )
388
+
382
389
  if self.cache_pipe is not None:
383
390
  if not fresh:
384
391
  _sync_cache_tuple = self.cache_pipe.sync(begin=begin, params=params, debug=debug, **kw)
@@ -438,7 +445,7 @@ def get_rowcount(
438
445
  params: Optional[Dict[str, Any]] = None,
439
446
  remote: bool = False,
440
447
  debug: bool = False
441
- ) -> Union[int, None]:
448
+ ) -> int:
442
449
  """
443
450
  Get a Pipe's instance or remote rowcount.
444
451
 
@@ -460,8 +467,7 @@ def get_rowcount(
460
467
  Returns
461
468
  -------
462
469
  An `int` of the number of rows in the pipe corresponding to the provided parameters.
463
- `None` is returned if the pipe does not exist.
464
-
470
+ Returns 0 if the pipe does not exist.
465
471
  """
466
472
  from meerschaum.utils.warnings import warn
467
473
  from meerschaum.utils.venv import Venv
@@ -470,7 +476,7 @@ def get_rowcount(
470
476
  connector = self.instance_connector if not remote else self.connector
471
477
  try:
472
478
  with Venv(get_connector_plugin(connector)):
473
- return connector.get_pipe_rowcount(
479
+ rowcount = connector.get_pipe_rowcount(
474
480
  self,
475
481
  begin = begin,
476
482
  end = end,
@@ -478,12 +484,15 @@ def get_rowcount(
478
484
  remote = remote,
479
485
  debug = debug,
480
486
  )
487
+ if rowcount is None:
488
+ return 0
489
+ return rowcount
481
490
  except AttributeError as e:
482
491
  warn(e)
483
492
  if remote:
484
- return None
493
+ return 0
485
494
  warn(f"Failed to get a rowcount for {self}.")
486
- return None
495
+ return 0
487
496
 
488
497
 
489
498
  def get_chunk_interval(
@@ -505,8 +514,8 @@ def get_chunk_interval(
505
514
  -------
506
515
  The chunk interval (`timedelta` or `int`) to use with this pipe's `datetime` axis.
507
516
  """
508
- default_chunk_minutes = get_config('pipes', 'parameters', 'chunk_minutes')
509
- configured_chunk_minutes = self.parameters.get('chunk_minutes', None)
517
+ default_chunk_minutes = get_config('pipes', 'parameters', 'verify', 'chunk_minutes')
518
+ configured_chunk_minutes = self.parameters.get('verify', {}).get('chunk_minutes', None)
510
519
  chunk_minutes = (
511
520
  (configured_chunk_minutes or default_chunk_minutes)
512
521
  if chunk_interval is None
@@ -559,7 +568,8 @@ def get_chunk_bounds(
559
568
 
560
569
  chunk_interval: Union[timedelta, int, None], default None
561
570
  If provided, use this interval for the size of chunk boundaries.
562
- The default value for this pipe may be set under `pipe.parameters['chunk_minutes']`.
571
+ The default value for this pipe may be set
572
+ under `pipe.parameters['verify']['chunk_minutes']`.
563
573
 
564
574
  debug: bool, default False
565
575
  Verbosity toggle.
@@ -578,7 +588,7 @@ def get_chunk_bounds(
578
588
  if begin is None and end is None:
579
589
  return [(None, None)]
580
590
 
581
- ### Set the chunk interval under `pipe.parameters['chunk_minutes']`.
591
+ ### Set the chunk interval under `pipe.parameters['verify']['chunk_minutes']`.
582
592
  chunk_interval = self.get_chunk_interval(chunk_interval, debug=debug)
583
593
 
584
594
  ### Build a list of tuples containing the chunk boundaries
@@ -65,6 +65,7 @@ def deduplicate(
65
65
  A `SuccessTuple` corresponding to whether all of the chunks were successfully deduplicated.
66
66
  """
67
67
  from meerschaum.utils.warnings import warn, info
68
+ from meerschaum.utils.misc import interval_str, items_str
68
69
  from meerschaum.utils.venv import Venv
69
70
  from meerschaum.connectors import get_connector_plugin
70
71
  from meerschaum.utils.pool import get_pool
@@ -74,6 +75,7 @@ def deduplicate(
74
75
  begin = begin,
75
76
  end = end,
76
77
  params = params,
78
+ bounded = bounded,
77
79
  debug = debug,
78
80
  **kwargs
79
81
  )
@@ -90,6 +92,7 @@ def deduplicate(
90
92
  begin = begin,
91
93
  end = end,
92
94
  params = params,
95
+ bounded = bounded,
93
96
  debug = debug,
94
97
  **kwargs
95
98
  )
@@ -104,8 +107,18 @@ def deduplicate(
104
107
  begin = (
105
108
  bound_time
106
109
  if bound_time is not None
107
- else self.get_sync_time(debug=debug)
110
+ else self.get_sync_time(newest=False, debug=debug)
108
111
  )
112
+ if bounded and end is None:
113
+ end = self.get_sync_time(newest=True, debug=debug)
114
+
115
+ if bounded and end is not None:
116
+ end += (
117
+ timedelta(minutes=1)
118
+ if isinstance(end, datetime)
119
+ else 1
120
+ )
121
+
109
122
  chunk_bounds = self.get_chunk_bounds(
110
123
  bounded = bounded,
111
124
  begin = begin,
@@ -115,6 +128,8 @@ def deduplicate(
115
128
  )
116
129
 
117
130
  indices = [col for col in self.columns.values() if col]
131
+ if not indices:
132
+ return False, f"Cannot deduplicate without index columns."
118
133
  dt_col = self.columns.get('datetime', None)
119
134
 
120
135
  def process_chunk_bounds(bounds) -> Tuple[
@@ -155,7 +170,20 @@ def deduplicate(
155
170
  return bounds, (True, f"{chunk_msg_header}\nChunk is empty, skipping...")
156
171
 
157
172
  chunk_indices = [ix for ix in indices if ix in full_chunk.columns]
158
- full_chunk = full_chunk.drop_duplicates(subset=chunk_indices, keep='last')
173
+ if not chunk_indices:
174
+ return bounds, (False, f"None of {items_str(indices)} were present in chunk.")
175
+ try:
176
+ full_chunk = full_chunk.drop_duplicates(
177
+ subset = chunk_indices,
178
+ keep = 'last'
179
+ ).reset_index(
180
+ drop = True,
181
+ )
182
+ except Exception as e:
183
+ return (
184
+ bounds,
185
+ (False, f"Failed to deduplicate chunk on {items_str(chunk_indices)}:\n({e})")
186
+ )
159
187
 
160
188
  clear_success, clear_msg = self.clear(
161
189
  begin = chunk_begin,
@@ -192,19 +220,16 @@ def deduplicate(
192
220
  True, (
193
221
  chunk_msg_header + "\n"
194
222
  + chunk_msg_body + ("\n" if chunk_msg_body else '')
195
- + f"Chunk succesfully deduplicated to {chunk_rowcount} rows."
223
+ + f"Deduplicated chunk from {existing_chunk_len} to {chunk_rowcount} rows."
196
224
  )
197
225
  )
198
226
 
199
- _start = chunk_bounds[0][(0 if bounded else 1)]
200
- _end = chunk_bounds[-1][(0 if not bounded else 1)]
201
- message_header = f"{_start} - {_end}"
202
227
  info(
203
228
  f"Deduplicating {len(chunk_bounds)} chunk"
204
229
  + ('s' if len(chunk_bounds) != 1 else '')
205
230
  + f" ({'un' if not bounded else ''}bounded)"
206
- + f" of size '{chunk_interval}'"
207
- + f" from '{_start}' to '{_end}'..."
231
+ + f" of size '{interval_str(chunk_interval)}'"
232
+ + f" on {self}."
208
233
  )
209
234
  bounds_success_tuples = dict(pool.map(process_chunk_bounds, chunk_bounds))
210
235
  bounds_successes = {
@@ -223,11 +248,10 @@ def deduplicate(
223
248
  return (
224
249
  False,
225
250
  (
226
- message_header + "\n"
227
- + f"Failed to deduplicate {len(bounds_failures)} chunk"
251
+ f"Failed to deduplicate {len(bounds_failures)} chunk"
228
252
  + ('s' if len(bounds_failures) != 1 else '')
229
- + ":\n"
230
- + "\n".join([msg for _, (_, msg) in bounds_failures.items()])
253
+ + ".\n"
254
+ + "\n".join([msg for _, (_, msg) in bounds_failures.items() if msg])
231
255
  )
232
256
  )
233
257
 
@@ -236,11 +260,10 @@ def deduplicate(
236
260
  return (
237
261
  True,
238
262
  (
239
- message_header + "\n"
240
- + f"Successfully deduplicated {len(bounds_successes)} chunk"
263
+ f"Successfully deduplicated {len(bounds_successes)} chunk"
241
264
  + ('s' if len(bounds_successes) != 1 else '')
242
265
  + ".\n"
243
- + "\n".join([msg for _, (_, msg) in bounds_successes.items()])
266
+ + "\n".join([msg for _, (_, msg) in bounds_successes.items() if msg])
244
267
  ).rstrip('\n')
245
268
  )
246
269
 
@@ -262,21 +285,19 @@ def deduplicate(
262
285
  return (
263
286
  True,
264
287
  (
265
- message_header + "\n"
266
- + f"Successfully deduplicated {len(bounds_successes)} chunk"
288
+ f"Successfully deduplicated {len(bounds_successes)} chunk"
267
289
  + ('s' if len(bounds_successes) != 1 else '')
268
- + f" ({len(retry_bounds_successes)} retried):\n"
269
- + "\n".join([msg for _, (_, msg) in bounds_successes.items()])
290
+ + f"({len(retry_bounds_successes)} retried):\n"
291
+ + "\n".join([msg for _, (_, msg) in bounds_successes.items() if msg])
270
292
  ).rstrip('\n')
271
293
  )
272
294
 
273
295
  return (
274
296
  False,
275
297
  (
276
- message_header + "\n"
277
- + f"Failed to deduplicate {len(bounds_failures)} chunk"
298
+ f"Failed to deduplicate {len(bounds_failures)} chunk"
278
299
  + ('s' if len(retry_bounds_failures) != 1 else '')
279
- + ":\n"
280
- + "\n".join([msg for _, (_, msg) in retry_bounds_failures.items()])
300
+ + ".\n"
301
+ + "\n".join([msg for _, (_, msg) in retry_bounds_failures.items() if msg])
281
302
  ).rstrip('\n')
282
303
  )
@@ -7,6 +7,7 @@ Enforce data types for a pipe's underlying table.
7
7
  """
8
8
 
9
9
  from __future__ import annotations
10
+ from io import StringIO
10
11
  from meerschaum.utils.typing import Dict, Any, Optional
11
12
 
12
13
  def enforce_dtypes(
@@ -38,7 +39,7 @@ def enforce_dtypes(
38
39
  try:
39
40
  if isinstance(df, str):
40
41
  df = parse_df_datetimes(
41
- pd.read_json(df),
42
+ pd.read_json(StringIO(df)),
42
43
  ignore_cols = [
43
44
  col
44
45
  for col, dtype in pipe_dtypes.items()
@@ -7,7 +7,9 @@ Functions for fetching new data into the Pipe
7
7
  """
8
8
 
9
9
  from __future__ import annotations
10
+ from datetime import timedelta
10
11
  from meerschaum.utils.typing import Optional, Any
12
+ from meerschaum.config import get_config
11
13
 
12
14
  def fetch(
13
15
  self,
@@ -98,3 +100,30 @@ def fetch(
98
100
  connector_plugin.deactivate_venv(debug=debug)
99
101
 
100
102
  return df
103
+
104
+
105
+ def get_backtrack_interval(self, debug: bool = False) -> Union[timedelta, int]:
106
+ """
107
+ Get the chunk interval to use for this pipe.
108
+
109
+ Returns
110
+ -------
111
+ The backtrack interval (`timedelta` or `int`) to use with this pipe's `datetime` axis.
112
+ """
113
+ default_backtrack_minutes = get_config('pipes', 'parameters', 'fetch', 'backtrack_minutes')
114
+ configured_backtrack_minutes = self.parameters.get('fetch', {}).get('backtrack_minutes', None)
115
+ backtrack_minutes = (
116
+ configured_backtrack_minutes
117
+ if configured_backtrack_minutes is not None
118
+ else default_backtrack_minutes
119
+ )
120
+
121
+ dt_col = self.columns.get('datetime', None)
122
+ if dt_col is None:
123
+ return timedelta(minutes=backtrack_minutes)
124
+
125
+ dt_dtype = self.dtypes.get(dt_col, 'datetime64[ns]')
126
+ if 'datetime' in dt_dtype.lower():
127
+ return timedelta(minutes=backtrack_minutes)
128
+
129
+ return backtrack_minutes