meerschaum 2.1.6__py3-none-any.whl → 2.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. meerschaum/__main__.py +1 -1
  2. meerschaum/_internal/arguments/_parser.py +3 -0
  3. meerschaum/_internal/entry.py +3 -2
  4. meerschaum/_internal/shell/Shell.py +1 -6
  5. meerschaum/actions/api.py +1 -1
  6. meerschaum/actions/install.py +7 -3
  7. meerschaum/actions/show.py +128 -42
  8. meerschaum/actions/sync.py +7 -3
  9. meerschaum/api/__init__.py +24 -14
  10. meerschaum/api/_oauth2.py +4 -4
  11. meerschaum/api/dash/callbacks/dashboard.py +93 -23
  12. meerschaum/api/dash/callbacks/jobs.py +55 -3
  13. meerschaum/api/dash/jobs.py +34 -8
  14. meerschaum/api/dash/keys.py +1 -1
  15. meerschaum/api/dash/pages/dashboard.py +14 -4
  16. meerschaum/api/dash/pipes.py +137 -26
  17. meerschaum/api/dash/plugins.py +25 -9
  18. meerschaum/api/resources/static/js/xterm.js +1 -1
  19. meerschaum/api/resources/templates/termpage.html +3 -0
  20. meerschaum/api/routes/_login.py +5 -4
  21. meerschaum/api/routes/_plugins.py +6 -3
  22. meerschaum/config/_dash.py +11 -0
  23. meerschaum/config/_default.py +3 -1
  24. meerschaum/config/_jobs.py +13 -4
  25. meerschaum/config/_paths.py +2 -0
  26. meerschaum/config/_shell.py +0 -1
  27. meerschaum/config/_sync.py +2 -3
  28. meerschaum/config/_version.py +1 -1
  29. meerschaum/config/stack/__init__.py +6 -7
  30. meerschaum/config/stack/grafana/__init__.py +1 -1
  31. meerschaum/config/static/__init__.py +4 -1
  32. meerschaum/connectors/__init__.py +2 -0
  33. meerschaum/connectors/api/_plugins.py +2 -1
  34. meerschaum/connectors/sql/SQLConnector.py +4 -2
  35. meerschaum/connectors/sql/_create_engine.py +9 -9
  36. meerschaum/connectors/sql/_fetch.py +8 -11
  37. meerschaum/connectors/sql/_instance.py +3 -1
  38. meerschaum/connectors/sql/_pipes.py +61 -39
  39. meerschaum/connectors/sql/_plugins.py +0 -2
  40. meerschaum/connectors/sql/_sql.py +7 -9
  41. meerschaum/core/Pipe/_dtypes.py +2 -1
  42. meerschaum/core/Pipe/_sync.py +26 -13
  43. meerschaum/core/User/_User.py +158 -16
  44. meerschaum/core/User/__init__.py +1 -1
  45. meerschaum/plugins/_Plugin.py +12 -3
  46. meerschaum/plugins/__init__.py +23 -1
  47. meerschaum/utils/daemon/Daemon.py +89 -36
  48. meerschaum/utils/daemon/FileDescriptorInterceptor.py +140 -0
  49. meerschaum/utils/daemon/RotatingFile.py +130 -14
  50. meerschaum/utils/daemon/__init__.py +3 -0
  51. meerschaum/utils/dataframe.py +183 -8
  52. meerschaum/utils/dtypes/__init__.py +9 -5
  53. meerschaum/utils/formatting/_pipes.py +44 -10
  54. meerschaum/utils/misc.py +34 -2
  55. meerschaum/utils/packages/__init__.py +25 -8
  56. meerschaum/utils/packages/_packages.py +18 -20
  57. meerschaum/utils/process.py +13 -10
  58. meerschaum/utils/schedule.py +276 -30
  59. meerschaum/utils/threading.py +1 -0
  60. meerschaum/utils/typing.py +1 -1
  61. {meerschaum-2.1.6.dist-info → meerschaum-2.2.0.dist-info}/METADATA +59 -62
  62. {meerschaum-2.1.6.dist-info → meerschaum-2.2.0.dist-info}/RECORD +68 -66
  63. {meerschaum-2.1.6.dist-info → meerschaum-2.2.0.dist-info}/WHEEL +1 -1
  64. {meerschaum-2.1.6.dist-info → meerschaum-2.2.0.dist-info}/LICENSE +0 -0
  65. {meerschaum-2.1.6.dist-info → meerschaum-2.2.0.dist-info}/NOTICE +0 -0
  66. {meerschaum-2.1.6.dist-info → meerschaum-2.2.0.dist-info}/entry_points.txt +0 -0
  67. {meerschaum-2.1.6.dist-info → meerschaum-2.2.0.dist-info}/top_level.txt +0 -0
  68. {meerschaum-2.1.6.dist-info → meerschaum-2.2.0.dist-info}/zip-safe +0 -0
@@ -0,0 +1,140 @@
1
+ #! /usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ # vim:fenc=utf-8
4
+
5
+ """
6
+ Intercept OS-level file descriptors.
7
+ """
8
+
9
+ import os
10
+ import select
11
+ import traceback
12
+ from threading import Event
13
+ from datetime import datetime
14
+ from meerschaum.utils.typing import Callable
15
+ from meerschaum.utils.warnings import warn
16
+
17
+ FD_CLOSED: int = 9
18
+ STOP_READING_FD_EVENT: Event = Event()
19
+
20
class FileDescriptorInterceptor:
    """
    A management class to intercept data written to a file descriptor.

    On construction, the target file descriptor is redirected into an
    internal pipe; `start_interception()` reads from that pipe, injects
    the hook's output after each newline, and forwards the result to a
    duplicate of the original descriptor.
    """
    def __init__(
        self,
        file_descriptor: int,
        injection_hook: Callable[[], str],
    ):
        """
        Parameters
        ----------
        file_descriptor: int
            The OS file descriptor from which to read.

        injection_hook: Callable[[], str]
            A callable which returns a string to be injected into the written data.
        """
        self.stop_event = Event()
        self.injection_hook = injection_hook
        self.original_file_descriptor = file_descriptor
        ### Duplicate the original target so forwarded data still reaches it
        ### after `file_descriptor` is redirected below.
        self.new_file_descriptor = os.dup(file_descriptor)
        self.read_pipe, self.write_pipe = os.pipe()
        ### A second pipe used purely to wake `select()` on shutdown.
        self.signal_read_pipe, self.signal_write_pipe = os.pipe()
        os.dup2(self.write_pipe, file_descriptor)

    def start_interception(self):
        """
        Read from the file descriptor and write the modified data after injection.

        NOTE: This is blocking and is meant to be run in a thread.
        """
        os.set_blocking(self.read_pipe, False)
        os.set_blocking(self.signal_read_pipe, False)
        is_first_read = True
        while not self.stop_event.is_set():
            try:
                rlist, _, _ = select.select(
                    [self.read_pipe, self.signal_read_pipe], [], [], 0.1
                )
                if self.signal_read_pipe in rlist:
                    break
                if not rlist:
                    continue
                data = os.read(self.read_pipe, 1024)
                if not data:
                    break
            except BlockingIOError:
                continue
            except OSError:
                continue

            ### NOTE: Compare a one-byte slice, not `data[-1]` (an `int`),
            ### against `b'\n'` — the previous int-vs-bytes comparison was
            ### always `False`, so the trailing-newline branch never ran.
            last_char_is_newline = data[-1:] == b'\n'

            injected_str = self.injection_hook()
            injected_bytes = injected_str.encode('utf-8')

            if is_first_read:
                data = b'\n' + data
                is_first_read = False

            ### Inject after every interior newline; when the chunk ends with
            ### a newline, don't inject after it (the next chunk's injection
            ### will begin that line instead).
            modified_data = (
                (data[:-1].replace(b'\n', b'\n' + injected_bytes) + b'\n')
                if last_char_is_newline
                else data.replace(b'\n', b'\n' + injected_bytes)
            )
            os.write(self.new_file_descriptor, modified_data)


    def stop_interception(self):
        """
        Signal the interception loop to exit and close the file descriptors.
        """
        self.stop_event.set()
        try:
            ### Wake up `select()` so the loop exits promptly.
            os.write(self.signal_write_pipe, b'\0')
        except OSError:
            pass

        descriptors_to_close = (
            ('duplicated file descriptor', self.new_file_descriptor),
            ('write-pipe', self.write_pipe),
            ('read-pipe', self.read_pipe),
            ('signal-read-pipe', self.signal_read_pipe),
            ('signal-write-pipe', self.signal_write_pipe),
        )
        for label, file_descriptor in descriptors_to_close:
            try:
                os.close(file_descriptor)
            except OSError as e:
                ### EBADF means the descriptor was already closed — expected.
                if e.errno != FD_CLOSED:
                    warn(
                        f"Error while trying to close the {label} "
                        + "to the intercepted file descriptor:\n"
                        + f"{traceback.format_exc()}"
                    )
@@ -13,9 +13,13 @@ import pathlib
13
13
  import traceback
14
14
  import sys
15
15
  import atexit
16
+ from datetime import datetime, timezone, timedelta
16
17
  from typing import List, Union, Optional, Tuple
17
18
  from meerschaum.config import get_config
18
19
  from meerschaum.utils.warnings import warn
20
+ from meerschaum.utils.misc import round_time
21
+ from meerschaum.utils.daemon.FileDescriptorInterceptor import FileDescriptorInterceptor
22
+ from meerschaum.utils.threading import Thread
19
23
  import meerschaum as mrsm
20
24
  daemon = mrsm.attempt_import('daemon')
21
25
 
@@ -33,6 +37,8 @@ class RotatingFile(io.IOBase):
33
37
  num_files_to_keep: Optional[int] = None,
34
38
  max_file_size: Optional[int] = None,
35
39
  redirect_streams: bool = False,
40
+ write_timestamps: bool = False,
41
+ timestamp_format: str = '%Y-%m-%d %H:%M',
36
42
  ):
37
43
  """
38
44
  Create a file-like object which manages other files.
@@ -54,6 +60,9 @@ class RotatingFile(io.IOBase):
54
60
 
55
61
  NOTE: Only set this to `True` if you are entering into a daemon context.
56
62
  Doing so will redirect `sys.stdout` and `sys.stderr` into the log files.
63
+
64
+ write_timestamps: bool, default False
65
+ If `True`, prepend the current UTC timestamp to each line of the file.
57
66
  """
58
67
  self.file_path = pathlib.Path(file_path)
59
68
  if num_files_to_keep is None:
@@ -68,6 +77,8 @@ class RotatingFile(io.IOBase):
68
77
  self.num_files_to_keep = num_files_to_keep
69
78
  self.max_file_size = max_file_size
70
79
  self.redirect_streams = redirect_streams
80
+ self.write_timestamps = write_timestamps
81
+ self.timestamp_format = timestamp_format
71
82
  self.subfile_regex_pattern = re.compile(
72
83
  r'^'
73
84
  + self.file_path.name
@@ -91,7 +102,7 @@ class RotatingFile(io.IOBase):
91
102
  """
92
103
  Return the file descriptor for the latest subfile.
93
104
  """
94
- self.refresh_files()
105
+ self.refresh_files(start_interception=False)
95
106
  return self._current_file_obj.fileno()
96
107
 
97
108
 
@@ -221,7 +232,11 @@ class RotatingFile(io.IOBase):
221
232
  ]
222
233
 
223
234
 
224
- def refresh_files(self, potential_new_len: int = 0) -> '_io.TextUIWrapper':
235
+ def refresh_files(
236
+ self,
237
+ potential_new_len: int = 0,
238
+ start_interception: bool = False,
239
+ ) -> '_io.TextUIWrapper':
225
240
  """
226
241
  Check the state of the subfiles.
227
242
  If the latest subfile is too large, create a new file and delete old ones.
@@ -229,6 +244,9 @@ class RotatingFile(io.IOBase):
229
244
  Parameters
230
245
  ----------
231
246
  potential_new_len: int, default 0
247
+
248
+ start_interception: bool, default False
249
+ If `True`, kick off the file interception threads.
232
250
  """
233
251
  self.flush()
234
252
 
@@ -247,8 +265,15 @@ class RotatingFile(io.IOBase):
247
265
  if is_first_run_with_logs or lost_latest_handle:
248
266
  self._current_file_obj = open(latest_subfile_path, 'a+', encoding='utf-8')
249
267
  if self.redirect_streams:
250
- daemon.daemon.redirect_stream(sys.stdout, self._current_file_obj)
251
- daemon.daemon.redirect_stream(sys.stderr, self._current_file_obj)
268
+ try:
269
+ daemon.daemon.redirect_stream(sys.stdout, self._current_file_obj)
270
+ daemon.daemon.redirect_stream(sys.stderr, self._current_file_obj)
271
+ except OSError as e:
272
+ warn(
273
+ f"Encountered an issue when redirecting streams:\n{traceback.format_exc()}"
274
+ )
275
+ if start_interception and self.write_timestamps:
276
+ self.start_log_fd_interception()
252
277
 
253
278
  create_new_file = (
254
279
  (latest_subfile_index == -1)
@@ -276,9 +301,10 @@ class RotatingFile(io.IOBase):
276
301
 
277
302
  ### Sanity check in case writing somehow fails.
278
303
  if self._previous_file_obj is self._current_file_obj:
279
- self._previous_file_obj is None
304
+ self._previous_file_obj = None
280
305
 
281
306
  self.delete(unused_only=True)
307
+
282
308
  return self._current_file_obj
283
309
 
284
310
 
@@ -291,6 +317,7 @@ class RotatingFile(io.IOBase):
291
317
  unused_only: bool, default False
292
318
  If `True`, only close file descriptors not currently in use.
293
319
  """
320
+ self.stop_log_fd_interception(unused_only=unused_only)
294
321
  subfile_indices = sorted(self.subfile_objects.keys())
295
322
  for subfile_index in subfile_indices:
296
323
  subfile_object = self.subfile_objects[subfile_index]
@@ -298,19 +325,26 @@ class RotatingFile(io.IOBase):
298
325
  continue
299
326
  try:
300
327
  if not subfile_object.closed:
301
- # subfile_object.flush()
302
328
  subfile_object.close()
303
- _ = self.subfile_objects.pop(subfile_index, None)
304
- if self.redirect_streams:
305
- _ = self._redirected_subfile_objects.pop(subfile_index, None)
306
329
  except Exception as e:
307
330
  warn(f"Failed to close an open subfile:\n{traceback.format_exc()}")
308
331
 
332
+ _ = self.subfile_objects.pop(subfile_index, None)
333
+ if self.redirect_streams:
334
+ _ = self._redirected_subfile_objects.pop(subfile_index, None)
335
+
309
336
  if not unused_only:
310
337
  self._previous_file_obj = None
311
338
  self._current_file_obj = None
312
339
 
313
340
 
341
def get_timestamp_prefix_str(self) -> str:
    """
    Return the current UTC timestamp (formatted with `self.timestamp_format`)
    followed by ' | ', to be prepended to each log line.
    """
    return datetime.now(timezone.utc).strftime(self.timestamp_format) + ' | '
346
+
347
+
314
348
  def write(self, data: str) -> None:
315
349
  """
316
350
  Write the given text into the latest subfile.
@@ -325,9 +359,18 @@ class RotatingFile(io.IOBase):
325
359
  if isinstance(data, bytes):
326
360
  data = data.decode('utf-8')
327
361
 
328
- self.refresh_files(potential_new_len=len(data))
362
+ prefix_str = self.get_timestamp_prefix_str() if self.write_timestamps else ""
363
+ suffix_str = "\n" if self.write_timestamps else ""
364
+ self.refresh_files(
365
+ potential_new_len = len(prefix_str + data + suffix_str),
366
+ start_interception = self.write_timestamps,
367
+ )
329
368
  try:
369
+ if prefix_str:
370
+ self._current_file_obj.write(prefix_str)
330
371
  self._current_file_obj.write(data)
372
+ if suffix_str:
373
+ self._current_file_obj.write(suffix_str)
331
374
  except Exception as e:
332
375
  warn(f"Failed to write to subfile:\n{traceback.format_exc()}")
333
376
  self.flush()
@@ -471,7 +514,7 @@ class RotatingFile(io.IOBase):
471
514
  subfile_object = self.subfile_objects[subfile_index]
472
515
  for i in range(self.SEEK_BACK_ATTEMPTS):
473
516
  try:
474
- subfile_object.seek(max(seek_ix - i), 0)
517
+ subfile_object.seek(max((seek_ix - i), 0))
475
518
  subfile_lines = subfile_object.readlines()
476
519
  except UnicodeDecodeError:
477
520
  continue
@@ -532,10 +575,83 @@ class RotatingFile(io.IOBase):
532
575
  try:
533
576
  subfile_object.flush()
534
577
  except Exception as e:
535
- warn(f"Failed to flush subfile:\n{traceback.format_exc()}")
578
+ warn(f"Failed to flush subfile {subfile_index}:\n{traceback.format_exc()}")
536
579
  if self.redirect_streams:
537
- sys.stdout.flush()
538
- sys.stderr.flush()
580
+ try:
581
+ sys.stdout.flush()
582
+ except Exception as e:
583
+ warn(f"Failed to flush STDOUT:\n{traceback.format_exc()}")
584
+ try:
585
+ sys.stderr.flush()
586
+ except Exception as e:
587
+ warn(f"Failed to flush STDERR:\n{traceback.format_exc()}")
588
+
589
+
590
def start_log_fd_interception(self):
    """
    Start the file descriptor monitoring threads.

    Intercepts STDOUT and STDERR so each line can be prefixed with a
    timestamp. No-op unless `write_timestamps` is enabled.
    """
    if not self.write_timestamps:
        return

    self._stdout_interceptor = FileDescriptorInterceptor(
        sys.stdout.fileno(),
        self.get_timestamp_prefix_str,
    )
    self._stderr_interceptor = FileDescriptorInterceptor(
        sys.stderr.fileno(),
        self.get_timestamp_prefix_str,
    )

    ### Daemon threads so a hung interceptor never blocks process exit.
    self._stdout_interceptor_thread = Thread(
        target = self._stdout_interceptor.start_interception,
        daemon = True,
    )
    self._stderr_interceptor_thread = Thread(
        target = self._stderr_interceptor.start_interception,
        daemon = True,
    )
    self._stdout_interceptor_thread.start()
    self._stderr_interceptor_thread.start()
    self._intercepting = True

    ### Track interceptors across log rotations so stale ones can be
    ### stopped below (the two just registered are kept running).
    if '_interceptor_threads' not in self.__dict__:
        self._interceptor_threads = []
    if '_interceptors' not in self.__dict__:
        self._interceptors = []
    self._interceptor_threads.extend([
        self._stdout_interceptor_thread,
        self._stderr_interceptor_thread,
    ])
    self._interceptors.extend([
        self._stdout_interceptor,
        self._stderr_interceptor,
    ])
    self.stop_log_fd_interception(unused_only=True)
632
+
633
+
634
def stop_log_fd_interception(self, unused_only: bool = False):
    """
    Stop the file descriptor monitoring threads.

    Parameters
    ----------
    unused_only: bool, default False
        If `True`, leave the two most recently registered interceptors
        (STDOUT + STDERR) and their threads running, stopping only the rest.
    """
    if not self.write_timestamps:
        return

    registered = self.__dict__.get('_interceptors', [])
    registered_threads = self.__dict__.get('_interceptor_threads', [])

    ### Everything before `cutoff` is stale and gets torn down.
    cutoff = -2 if unused_only else len(registered)

    for stale_interceptor in registered[:cutoff]:
        stale_interceptor.stop_interception()
    del registered[:cutoff]

    for stale_thread in registered_threads[:cutoff]:
        try:
            stale_thread.join()
        except Exception:
            warn(f"Failed to join interceptor threads:\n{traceback.format_exc()}")
    del registered_threads[:cutoff]
539
655
 
540
656
 
541
657
  def __repr__(self) -> str:
@@ -12,6 +12,7 @@ from meerschaum.utils.typing import SuccessTuple, List, Optional, Callable, Any,
12
12
  from meerschaum.config._paths import DAEMON_RESOURCES_PATH
13
13
  from meerschaum.utils.daemon.Daemon import Daemon
14
14
  from meerschaum.utils.daemon.RotatingFile import RotatingFile
15
+ from meerschaum.utils.daemon.FileDescriptorInterceptor import FileDescriptorInterceptor
15
16
 
16
17
 
17
18
  def daemon_entry(sysargs: Optional[List[str]] = None) -> SuccessTuple:
@@ -63,6 +64,8 @@ def daemon_entry(sysargs: Optional[List[str]] = None) -> SuccessTuple:
63
64
 
64
65
  ### Only run if the kwargs equal or no actions are provided.
65
66
  if existing_kwargs == _args or not _args.get('action', []):
67
+ if daemon.status == 'running':
68
+ return True, f"Daemon '{daemon}' is already running."
66
69
  return daemon.run(
67
70
  debug = debug,
68
71
  allow_dirty_run = True,
@@ -7,9 +7,10 @@ Utility functions for working with DataFrames.
7
7
  """
8
8
 
9
9
  from __future__ import annotations
10
+ from datetime import datetime
10
11
  from meerschaum.utils.typing import (
11
12
  Optional, Dict, Any, List, Hashable, Generator,
12
- Iterator, Iterable, Union,
13
+ Iterator, Iterable, Union, Tuple,
13
14
  )
14
15
 
15
16
 
@@ -71,6 +72,7 @@ def add_missing_cols_to_df(df: 'pd.DataFrame', dtypes: Dict[str, Any]) -> pd.Dat
71
72
  def filter_unseen_df(
72
73
  old_df: 'pd.DataFrame',
73
74
  new_df: 'pd.DataFrame',
75
+ safe_copy: bool = True,
74
76
  dtypes: Optional[Dict[str, Any]] = None,
75
77
  debug: bool = False,
76
78
  ) -> 'pd.DataFrame':
@@ -84,6 +86,10 @@ def filter_unseen_df(
84
86
 
85
87
  new_df: 'pd.DataFrame'
86
88
  The fetched (source) dataframe. Rows that are contained in `old_df` are removed.
89
+
90
+ safe_copy: bool, default True
91
+ If `True`, create a copy before comparing and modifying the dataframes.
92
+ Setting to `False` may mutate the DataFrames.
87
93
 
88
94
  dtypes: Optional[Dict[str, Any]], default None
89
95
  Optionally specify the datatypes of the dataframe.
@@ -111,6 +117,10 @@ def filter_unseen_df(
111
117
  if old_df is None:
112
118
  return new_df
113
119
 
120
+ if safe_copy:
121
+ old_df = old_df.copy()
122
+ new_df = new_df.copy()
123
+
114
124
  import json
115
125
  import functools
116
126
  import traceback
@@ -118,6 +128,7 @@ def filter_unseen_df(
118
128
  from meerschaum.utils.warnings import warn
119
129
  from meerschaum.utils.packages import import_pandas, attempt_import
120
130
  from meerschaum.utils.dtypes import to_pandas_dtype, are_dtypes_equal, attempt_cast_to_numeric
131
+ from meerschaum.utils.debug import dprint
121
132
  pd = import_pandas(debug=debug)
122
133
  is_dask = 'dask' in new_df.__module__
123
134
  if is_dask:
@@ -243,12 +254,7 @@ def filter_unseen_df(
243
254
  indicator = True,
244
255
  )
245
256
  changed_rows_mask = (joined_df['_merge'] == 'left_only')
246
-
247
- delta_df = joined_df[
248
- list(new_df_dtypes.keys())
249
- ][
250
- changed_rows_mask
251
- ].reset_index(drop=True)
257
+ delta_df = joined_df[list(new_df_dtypes.keys())][changed_rows_mask].reset_index(drop=True)
252
258
 
253
259
  for json_col in json_cols:
254
260
  if json_col not in delta_df.columns:
@@ -535,6 +541,8 @@ def get_numeric_cols(df: 'pd.DataFrame') -> List[str]:
535
541
  def enforce_dtypes(
536
542
  df: 'pd.DataFrame',
537
543
  dtypes: Dict[str, str],
544
+ safe_copy: bool = True,
545
+ coerce_numeric: bool = True,
538
546
  debug: bool = False,
539
547
  ) -> 'pd.DataFrame':
540
548
  """
@@ -548,6 +556,14 @@ def enforce_dtypes(
548
556
  dtypes: Dict[str, str]
549
557
  The data types to attempt to enforce on the DataFrame.
550
558
 
559
+ safe_copy: bool, default True
560
+ If `True`, create a copy before comparing and modifying the dataframes.
561
+ Setting to `False` may mutate the DataFrames.
562
+ See `meerschaum.utils.dataframe.filter_unseen_df`.
563
+
564
+ coerce_numeric: bool, default True
565
+ If `True`, convert float and int collisions to numeric.
566
+
551
567
  debug: bool, default False
552
568
  Verbosity toggle.
553
569
 
@@ -569,6 +585,8 @@ def enforce_dtypes(
569
585
  is_dtype_numeric,
570
586
  attempt_cast_to_numeric,
571
587
  )
588
+ if safe_copy:
589
+ df = df.copy()
572
590
  df_dtypes = {c: str(t) for c, t in df.dtypes.items()}
573
591
  if len(df_dtypes) == 0:
574
592
  if debug:
@@ -674,7 +692,7 @@ def enforce_dtypes(
674
692
  explicitly_numeric
675
693
  or col in df_numeric_cols
676
694
  or (mixed_numeric_types and not explicitly_float)
677
- )
695
+ ) and coerce_numeric
678
696
  if cast_to_numeric:
679
697
  common_dtypes[col] = attempt_cast_to_numeric
680
698
  common_diff_dtypes[col] = attempt_cast_to_numeric
@@ -860,3 +878,160 @@ def get_first_valid_dask_partition(ddf: 'dask.dataframe.DataFrame') -> Union['pd
860
878
  if len(pdf) > 0:
861
879
  return pdf
862
880
  return ddf.compute()
881
+
882
+
883
def query_df(
        df: 'pd.DataFrame',
        params: Optional[Dict[str, Any]] = None,
        begin: Union[datetime, int, None] = None,
        end: Union[datetime, int, None] = None,
        datetime_column: Optional[str] = None,
        select_columns: Optional[List[str]] = None,
        omit_columns: Optional[List[str]] = None,
        inplace: bool = False,
        reset_index: bool = False,
        debug: bool = False,
    ) -> 'pd.DataFrame':
    """
    Query the dataframe with the params dictionary.

    Parameters
    ----------
    df: pd.DataFrame
        The DataFrame to query against.

    params: Optional[Dict[str, Any]], default None
        The parameters dictionary to use for the query.

    begin: Union[datetime, int, None], default None
        If `begin` and `datetime_column` are provided, only return rows with a timestamp
        greater than or equal to this value.

    end: Union[datetime, int, None], default None
        If `end` and `datetime_column` are provided, only return rows with a timestamp
        less than this value.

    datetime_column: Optional[str], default None
        A `datetime_column` must be provided to use `begin` and `end`.

    select_columns: Optional[List[str]], default None
        If provided, only return these columns.

    omit_columns: Optional[List[str]], default None
        If provided, do not include these columns in the result.

    inplace: bool, default False
        If `True`, modify the DataFrame inplace rather than creating a new DataFrame.

    reset_index: bool, default False
        If `True`, reset the index in the resulting DataFrame.

    debug: bool, default False
        Verbosity toggle.

    Returns
    -------
    A Pandas DataFrame query result.
    """
    ### NOTE: Compare `begin` / `end` against `None` explicitly (not by
    ### truthiness) so that integer datetime axes may filter from 0.
    if not params and begin is None and end is None:
        return df

    import json
    from meerschaum.utils.debug import dprint
    from meerschaum.utils.misc import get_in_ex_params
    from meerschaum.utils.warnings import warn

    ### Capture the incoming dtypes so they can be restored after masking
    ### (NaN-masking below may upcast columns).
    dtypes = {col: str(typ) for col, typ in df.dtypes.items()}

    if begin is not None or end is not None:
        if not datetime_column or datetime_column not in df.columns:
            warn(
                f"The datetime column '{datetime_column}' is not present in the Dataframe, "
                + "ignoring begin and end...",
            )
            begin, end = None, None

    if debug:
        dprint(f"Querying dataframe:\n{params=} {begin=} {end=} {datetime_column=}")

    in_ex_params = get_in_ex_params(params)

    def serialize(x: Any) -> str:
        """Canonicalize a value so params may be compared against cell values."""
        if isinstance(x, (dict, list, tuple)):
            return json.dumps(x, sort_keys=True, separators=(',', ':'), default=str)
        if hasattr(x, 'isoformat'):
            return x.isoformat()
        return str(x)

    ### Bounds mask: inclusive of `begin`, exclusive of `end`.
    masks = [
        (
            (df[datetime_column] >= begin)
            if begin is not None and datetime_column
            else True
        ) & (
            (df[datetime_column] < end)
            if end is not None and datetime_column
            else True
        )
    ]

    ### Per-column masks: keep rows matching any "in" value and no "ex" value.
    masks.extend([
        (
            (
                df[col].apply(serialize).isin(
                    [
                        serialize(_in_val)
                        for _in_val in in_vals
                    ]
                ) if in_vals else True
            ) & (
                ~df[col].apply(serialize).isin(
                    [
                        serialize(_ex_val)
                        for _ex_val in ex_vals
                    ]
                ) if ex_vals else True
            )
        )
        for col, (in_vals, ex_vals) in in_ex_params.items()
        if col in df.columns
    ])

    ### AND the bounds mask with each per-column mask
    ### (skip masks[0] in the loop — it is already the seed).
    query_mask = masks[0]
    for mask in masks[1:]:
        query_mask = query_mask & mask

    if inplace:
        df.where(query_mask, inplace=inplace)
        df.dropna(how='all', inplace=inplace)
        result_df = df
    else:
        result_df = df.where(query_mask).dropna(how='all')

    if reset_index:
        result_df.reset_index(drop=True, inplace=True)

    result_df = enforce_dtypes(
        result_df,
        dtypes,
        safe_copy = (not inplace),
        debug = debug,
        coerce_numeric = False,
    )

    if select_columns == ['*']:
        select_columns = None

    if not select_columns and not omit_columns:
        return result_df

    if select_columns:
        for col in list(result_df.columns):
            if col not in select_columns:
                del result_df[col]
        return result_df

    if omit_columns:
        for col in list(result_df.columns):
            if col in omit_columns:
                del result_df[col]
    if debug:
        dprint(f"{dtypes=}")
    return result_df