meerschaum 2.1.6__py3-none-any.whl → 2.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- meerschaum/__main__.py +1 -1
- meerschaum/_internal/arguments/_parser.py +3 -0
- meerschaum/_internal/entry.py +3 -2
- meerschaum/_internal/shell/Shell.py +1 -6
- meerschaum/actions/api.py +1 -1
- meerschaum/actions/install.py +7 -3
- meerschaum/actions/show.py +128 -42
- meerschaum/actions/sync.py +7 -3
- meerschaum/api/__init__.py +24 -14
- meerschaum/api/_oauth2.py +4 -4
- meerschaum/api/dash/callbacks/dashboard.py +93 -23
- meerschaum/api/dash/callbacks/jobs.py +55 -3
- meerschaum/api/dash/jobs.py +34 -8
- meerschaum/api/dash/keys.py +1 -1
- meerschaum/api/dash/pages/dashboard.py +14 -4
- meerschaum/api/dash/pipes.py +137 -26
- meerschaum/api/dash/plugins.py +25 -9
- meerschaum/api/resources/static/js/xterm.js +1 -1
- meerschaum/api/resources/templates/termpage.html +3 -0
- meerschaum/api/routes/_login.py +5 -4
- meerschaum/api/routes/_plugins.py +6 -3
- meerschaum/config/_dash.py +11 -0
- meerschaum/config/_default.py +3 -1
- meerschaum/config/_jobs.py +13 -4
- meerschaum/config/_paths.py +2 -0
- meerschaum/config/_shell.py +0 -1
- meerschaum/config/_sync.py +2 -3
- meerschaum/config/_version.py +1 -1
- meerschaum/config/stack/__init__.py +6 -7
- meerschaum/config/stack/grafana/__init__.py +1 -1
- meerschaum/config/static/__init__.py +4 -1
- meerschaum/connectors/__init__.py +2 -0
- meerschaum/connectors/api/_plugins.py +2 -1
- meerschaum/connectors/sql/SQLConnector.py +4 -2
- meerschaum/connectors/sql/_create_engine.py +9 -9
- meerschaum/connectors/sql/_fetch.py +8 -11
- meerschaum/connectors/sql/_instance.py +3 -1
- meerschaum/connectors/sql/_pipes.py +61 -39
- meerschaum/connectors/sql/_plugins.py +0 -2
- meerschaum/connectors/sql/_sql.py +7 -9
- meerschaum/core/Pipe/_dtypes.py +2 -1
- meerschaum/core/Pipe/_sync.py +26 -13
- meerschaum/core/User/_User.py +158 -16
- meerschaum/core/User/__init__.py +1 -1
- meerschaum/plugins/_Plugin.py +12 -3
- meerschaum/plugins/__init__.py +23 -1
- meerschaum/utils/daemon/Daemon.py +89 -36
- meerschaum/utils/daemon/FileDescriptorInterceptor.py +140 -0
- meerschaum/utils/daemon/RotatingFile.py +130 -14
- meerschaum/utils/daemon/__init__.py +3 -0
- meerschaum/utils/dataframe.py +183 -8
- meerschaum/utils/dtypes/__init__.py +9 -5
- meerschaum/utils/formatting/_pipes.py +44 -10
- meerschaum/utils/misc.py +34 -2
- meerschaum/utils/packages/__init__.py +25 -8
- meerschaum/utils/packages/_packages.py +18 -20
- meerschaum/utils/process.py +13 -10
- meerschaum/utils/schedule.py +276 -30
- meerschaum/utils/threading.py +1 -0
- meerschaum/utils/typing.py +1 -1
- {meerschaum-2.1.6.dist-info → meerschaum-2.2.0.dist-info}/METADATA +59 -62
- {meerschaum-2.1.6.dist-info → meerschaum-2.2.0.dist-info}/RECORD +68 -66
- {meerschaum-2.1.6.dist-info → meerschaum-2.2.0.dist-info}/WHEEL +1 -1
- {meerschaum-2.1.6.dist-info → meerschaum-2.2.0.dist-info}/LICENSE +0 -0
- {meerschaum-2.1.6.dist-info → meerschaum-2.2.0.dist-info}/NOTICE +0 -0
- {meerschaum-2.1.6.dist-info → meerschaum-2.2.0.dist-info}/entry_points.txt +0 -0
- {meerschaum-2.1.6.dist-info → meerschaum-2.2.0.dist-info}/top_level.txt +0 -0
- {meerschaum-2.1.6.dist-info → meerschaum-2.2.0.dist-info}/zip-safe +0 -0
@@ -0,0 +1,140 @@
|
|
1
|
+
#! /usr/bin/env python3
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
# vim:fenc=utf-8
|
4
|
+
|
5
|
+
"""
|
6
|
+
Intercept OS-level file descriptors.
|
7
|
+
"""
|
8
|
+
|
9
|
+
import os
|
10
|
+
import select
|
11
|
+
import traceback
|
12
|
+
from threading import Event
|
13
|
+
from datetime import datetime
|
14
|
+
from meerschaum.utils.typing import Callable
|
15
|
+
from meerschaum.utils.warnings import warn
|
16
|
+
|
17
|
+
FD_CLOSED: int = 9
|
18
|
+
STOP_READING_FD_EVENT: Event = Event()
|
19
|
+
|
20
|
+
class FileDescriptorInterceptor:
    """
    A management class to intercept data written to a file descriptor.

    On construction the target descriptor is duplicated, then replaced (via `os.dup2`)
    by the write-end of a new pipe. `start_interception()` reads from that pipe,
    injects the hook's string after every newline, and writes the result
    to the duplicated descriptor.
    """
    def __init__(
        self,
        file_descriptor: int,
        injection_hook: Callable[[], str],
    ):
        """
        Parameters
        ----------
        file_descriptor: int
            The OS file descriptor from which to read.

        injection_hook: Callable[[], str]
            A callable which returns a string to be injected into the written data.
        """
        self.stop_event = Event()
        self.injection_hook = injection_hook
        self.original_file_descriptor = file_descriptor
        # Duplicate first so the original destination stays reachable
        # after `file_descriptor` is pointed at the pipe below.
        self.new_file_descriptor = os.dup(file_descriptor)
        self.read_pipe, self.write_pipe = os.pipe()
        # A second pipe is used only to wake the reader loop when stopping.
        self.signal_read_pipe, self.signal_write_pipe = os.pipe()
        os.dup2(self.write_pipe, file_descriptor)

    def start_interception(self):
        """
        Read from the file descriptor and write the modified data after injection.

        NOTE: This is blocking and is meant to be run in a thread.
        """
        os.set_blocking(self.read_pipe, False)
        os.set_blocking(self.signal_read_pipe, False)
        is_first_read = True
        while not self.stop_event.is_set():
            try:
                rlist, _, _ = select.select([self.read_pipe, self.signal_read_pipe], [], [], 0.1)
                if self.signal_read_pipe in rlist:
                    break
                if not rlist:
                    continue
                data = os.read(self.read_pipe, 1024)
                if not data:
                    break
            except OSError:
                # Covers `BlockingIOError` (a subclass of `OSError`) as well as
                # descriptors closed by `stop_interception()`; the loop condition
                # re-checks `stop_event` on the next pass.
                continue

            injected_bytes = self.injection_hook().encode('utf-8')

            if is_first_read:
                # Lead with a newline so the very first line is prefixed too.
                data = b'\n' + data
                is_first_read = False

            # The prefix is injected after *every* newline, including a trailing one,
            # so the first line of the next chunk is already prefixed.
            modified_data = data.replace(b'\n', b'\n' + injected_bytes)

            # `os.write` may write fewer bytes than requested; keep going until done.
            remaining = memoryview(modified_data)
            while remaining:
                num_written = os.write(self.new_file_descriptor, remaining)
                if num_written <= 0:
                    break
                remaining = remaining[num_written:]

    def _close_fd(self, fd: int, description: str) -> None:
        """
        Close `fd`, warning on any failure other than the descriptor already being closed.
        """
        try:
            os.close(fd)
        except OSError as e:
            if e.errno != FD_CLOSED:
                warn(
                    f"Error while trying to close the {description}:\n"
                    + f"{traceback.format_exc()}"
                )

    def stop_interception(self):
        """
        Signal the reader loop to stop and close the new file descriptors.
        """
        self.stop_event.set()
        try:
            # Wake the `select` call in `start_interception()` immediately.
            os.write(self.signal_write_pipe, b'\0')
        except OSError as e:
            # An already-closed signal pipe means interception was stopped earlier.
            if e.errno != FD_CLOSED:
                warn(
                    "Error while trying to signal the interception loop to stop:\n"
                    + f"{traceback.format_exc()}"
                )

        self._close_fd(self.new_file_descriptor, "duplicated file descriptor")
        self._close_fd(self.write_pipe, "write-pipe to the intercepted file descriptor")
        self._close_fd(self.read_pipe, "read-pipe to the intercepted file descriptor")
        self._close_fd(
            self.signal_read_pipe,
            "signal-read-pipe to the intercepted file descriptor",
        )
        self._close_fd(
            self.signal_write_pipe,
            "signal-write-pipe to the intercepted file descriptor",
        )
|
@@ -13,9 +13,13 @@ import pathlib
|
|
13
13
|
import traceback
|
14
14
|
import sys
|
15
15
|
import atexit
|
16
|
+
from datetime import datetime, timezone, timedelta
|
16
17
|
from typing import List, Union, Optional, Tuple
|
17
18
|
from meerschaum.config import get_config
|
18
19
|
from meerschaum.utils.warnings import warn
|
20
|
+
from meerschaum.utils.misc import round_time
|
21
|
+
from meerschaum.utils.daemon.FileDescriptorInterceptor import FileDescriptorInterceptor
|
22
|
+
from meerschaum.utils.threading import Thread
|
19
23
|
import meerschaum as mrsm
|
20
24
|
daemon = mrsm.attempt_import('daemon')
|
21
25
|
|
@@ -33,6 +37,8 @@ class RotatingFile(io.IOBase):
|
|
33
37
|
num_files_to_keep: Optional[int] = None,
|
34
38
|
max_file_size: Optional[int] = None,
|
35
39
|
redirect_streams: bool = False,
|
40
|
+
write_timestamps: bool = False,
|
41
|
+
timestamp_format: str = '%Y-%m-%d %H:%M',
|
36
42
|
):
|
37
43
|
"""
|
38
44
|
Create a file-like object which manages other files.
|
@@ -54,6 +60,9 @@ class RotatingFile(io.IOBase):
|
|
54
60
|
|
55
61
|
NOTE: Only set this to `True` if you are entering into a daemon context.
|
56
62
|
Doing so will redirect `sys.stdout` and `sys.stderr` into the log files.
|
63
|
+
|
64
|
+
write_timestamps: bool, default False
|
65
|
+
If `True`, prepend the current UTC timestamp to each line of the file.
|
57
66
|
"""
|
58
67
|
self.file_path = pathlib.Path(file_path)
|
59
68
|
if num_files_to_keep is None:
|
@@ -68,6 +77,8 @@ class RotatingFile(io.IOBase):
|
|
68
77
|
self.num_files_to_keep = num_files_to_keep
|
69
78
|
self.max_file_size = max_file_size
|
70
79
|
self.redirect_streams = redirect_streams
|
80
|
+
self.write_timestamps = write_timestamps
|
81
|
+
self.timestamp_format = timestamp_format
|
71
82
|
self.subfile_regex_pattern = re.compile(
|
72
83
|
r'^'
|
73
84
|
+ self.file_path.name
|
@@ -91,7 +102,7 @@ class RotatingFile(io.IOBase):
|
|
91
102
|
"""
|
92
103
|
Return the file descriptor for the latest subfile.
|
93
104
|
"""
|
94
|
-
self.refresh_files()
|
105
|
+
self.refresh_files(start_interception=False)
|
95
106
|
return self._current_file_obj.fileno()
|
96
107
|
|
97
108
|
|
@@ -221,7 +232,11 @@ class RotatingFile(io.IOBase):
|
|
221
232
|
]
|
222
233
|
|
223
234
|
|
224
|
-
def refresh_files(
|
235
|
+
def refresh_files(
|
236
|
+
self,
|
237
|
+
potential_new_len: int = 0,
|
238
|
+
start_interception: bool = False,
|
239
|
+
) -> '_io.TextUIWrapper':
|
225
240
|
"""
|
226
241
|
Check the state of the subfiles.
|
227
242
|
If the latest subfile is too large, create a new file and delete old ones.
|
@@ -229,6 +244,9 @@ class RotatingFile(io.IOBase):
|
|
229
244
|
Parameters
|
230
245
|
----------
|
231
246
|
potential_new_len: int, default 0
|
247
|
+
|
248
|
+
start_interception: bool, default False
|
249
|
+
If `True`, kick off the file interception threads.
|
232
250
|
"""
|
233
251
|
self.flush()
|
234
252
|
|
@@ -247,8 +265,15 @@ class RotatingFile(io.IOBase):
|
|
247
265
|
if is_first_run_with_logs or lost_latest_handle:
|
248
266
|
self._current_file_obj = open(latest_subfile_path, 'a+', encoding='utf-8')
|
249
267
|
if self.redirect_streams:
|
250
|
-
|
251
|
-
|
268
|
+
try:
|
269
|
+
daemon.daemon.redirect_stream(sys.stdout, self._current_file_obj)
|
270
|
+
daemon.daemon.redirect_stream(sys.stderr, self._current_file_obj)
|
271
|
+
except OSError as e:
|
272
|
+
warn(
|
273
|
+
f"Encountered an issue when redirecting streams:\n{traceback.format_exc()}"
|
274
|
+
)
|
275
|
+
if start_interception and self.write_timestamps:
|
276
|
+
self.start_log_fd_interception()
|
252
277
|
|
253
278
|
create_new_file = (
|
254
279
|
(latest_subfile_index == -1)
|
@@ -276,9 +301,10 @@ class RotatingFile(io.IOBase):
|
|
276
301
|
|
277
302
|
### Sanity check in case writing somehow fails.
|
278
303
|
if self._previous_file_obj is self._current_file_obj:
|
279
|
-
self._previous_file_obj
|
304
|
+
self._previous_file_obj = None
|
280
305
|
|
281
306
|
self.delete(unused_only=True)
|
307
|
+
|
282
308
|
return self._current_file_obj
|
283
309
|
|
284
310
|
|
@@ -291,6 +317,7 @@ class RotatingFile(io.IOBase):
|
|
291
317
|
unused_only: bool, default False
|
292
318
|
If `True`, only close file descriptors not currently in use.
|
293
319
|
"""
|
320
|
+
self.stop_log_fd_interception(unused_only=unused_only)
|
294
321
|
subfile_indices = sorted(self.subfile_objects.keys())
|
295
322
|
for subfile_index in subfile_indices:
|
296
323
|
subfile_object = self.subfile_objects[subfile_index]
|
@@ -298,19 +325,26 @@ class RotatingFile(io.IOBase):
|
|
298
325
|
continue
|
299
326
|
try:
|
300
327
|
if not subfile_object.closed:
|
301
|
-
# subfile_object.flush()
|
302
328
|
subfile_object.close()
|
303
|
-
_ = self.subfile_objects.pop(subfile_index, None)
|
304
|
-
if self.redirect_streams:
|
305
|
-
_ = self._redirected_subfile_objects.pop(subfile_index, None)
|
306
329
|
except Exception as e:
|
307
330
|
warn(f"Failed to close an open subfile:\n{traceback.format_exc()}")
|
308
331
|
|
332
|
+
_ = self.subfile_objects.pop(subfile_index, None)
|
333
|
+
if self.redirect_streams:
|
334
|
+
_ = self._redirected_subfile_objects.pop(subfile_index, None)
|
335
|
+
|
309
336
|
if not unused_only:
|
310
337
|
self._previous_file_obj = None
|
311
338
|
self._current_file_obj = None
|
312
339
|
|
313
340
|
|
341
|
+
def get_timestamp_prefix_str(self) -> str:
    """
    Return the timestamp prefix string for the current UTC time.

    The current UTC time is formatted with `self.timestamp_format`
    and followed by the separator `' | '`.
    """
    utc_now = datetime.now(timezone.utc)
    return f"{utc_now.strftime(self.timestamp_format)} | "
|
346
|
+
|
347
|
+
|
314
348
|
def write(self, data: str) -> None:
|
315
349
|
"""
|
316
350
|
Write the given text into the latest subfile.
|
@@ -325,9 +359,18 @@ class RotatingFile(io.IOBase):
|
|
325
359
|
if isinstance(data, bytes):
|
326
360
|
data = data.decode('utf-8')
|
327
361
|
|
328
|
-
self.
|
362
|
+
prefix_str = self.get_timestamp_prefix_str() if self.write_timestamps else ""
|
363
|
+
suffix_str = "\n" if self.write_timestamps else ""
|
364
|
+
self.refresh_files(
|
365
|
+
potential_new_len = len(prefix_str + data + suffix_str),
|
366
|
+
start_interception = self.write_timestamps,
|
367
|
+
)
|
329
368
|
try:
|
369
|
+
if prefix_str:
|
370
|
+
self._current_file_obj.write(prefix_str)
|
330
371
|
self._current_file_obj.write(data)
|
372
|
+
if suffix_str:
|
373
|
+
self._current_file_obj.write(suffix_str)
|
331
374
|
except Exception as e:
|
332
375
|
warn(f"Failed to write to subfile:\n{traceback.format_exc()}")
|
333
376
|
self.flush()
|
@@ -471,7 +514,7 @@ class RotatingFile(io.IOBase):
|
|
471
514
|
subfile_object = self.subfile_objects[subfile_index]
|
472
515
|
for i in range(self.SEEK_BACK_ATTEMPTS):
|
473
516
|
try:
|
474
|
-
subfile_object.seek(max(seek_ix - i), 0)
|
517
|
+
subfile_object.seek(max((seek_ix - i), 0))
|
475
518
|
subfile_lines = subfile_object.readlines()
|
476
519
|
except UnicodeDecodeError:
|
477
520
|
continue
|
@@ -532,10 +575,83 @@ class RotatingFile(io.IOBase):
|
|
532
575
|
try:
|
533
576
|
subfile_object.flush()
|
534
577
|
except Exception as e:
|
535
|
-
warn(f"Failed to flush subfile:\n{traceback.format_exc()}")
|
578
|
+
warn(f"Failed to flush subfile {subfile_index}:\n{traceback.format_exc()}")
|
536
579
|
if self.redirect_streams:
|
537
|
-
|
538
|
-
|
580
|
+
try:
|
581
|
+
sys.stdout.flush()
|
582
|
+
except Exception as e:
|
583
|
+
warn(f"Failed to flush STDOUT:\n{traceback.format_exc()}")
|
584
|
+
try:
|
585
|
+
sys.stderr.flush()
|
586
|
+
except Exception as e:
|
587
|
+
warn(f"Failed to flush STDERR:\n{traceback.format_exc()}")
|
588
|
+
|
589
|
+
|
590
|
+
def start_log_fd_interception(self):
    """
    Start the file descriptor monitoring threads.

    Creates one interceptor (and one daemon thread) each for `sys.stdout` and `sys.stderr`,
    records them on the instance, and then stops all but the newest pair.
    Does nothing unless `write_timestamps` is set.
    """
    if not self.write_timestamps:
        return

    self._stdout_interceptor = FileDescriptorInterceptor(
        sys.stdout.fileno(),
        self.get_timestamp_prefix_str,
    )
    self._stderr_interceptor = FileDescriptorInterceptor(
        sys.stderr.fileno(),
        self.get_timestamp_prefix_str,
    )

    self._stdout_interceptor_thread = Thread(
        target = self._stdout_interceptor.start_interception,
        daemon = True,
    )
    self._stderr_interceptor_thread = Thread(
        target = self._stderr_interceptor.start_interception,
        daemon = True,
    )
    self._stdout_interceptor_thread.start()
    self._stderr_interceptor_thread.start()
    self._intercepting = True

    # The bookkeeping lists are created lazily on first use.
    if '_interceptor_threads' not in self.__dict__:
        self._interceptor_threads = []
    if '_interceptors' not in self.__dict__:
        self._interceptors = []
    self._interceptor_threads.extend([
        self._stdout_interceptor_thread,
        self._stderr_interceptor_thread,
    ])
    self._interceptors.extend([
        self._stdout_interceptor,
        self._stderr_interceptor,
    ])
    # Retire every pair except the one just appended.
    self.stop_log_fd_interception(unused_only=True)
|
632
|
+
|
633
|
+
|
634
|
+
def stop_log_fd_interception(self, unused_only: bool = False):
    """
    Stop the file descriptor monitoring threads.

    Parameters
    ----------
    unused_only: bool, default False
        If `True`, leave the last two interceptors (and threads) running
        and only stop the ones before them.
    """
    if not self.write_timestamps:
        return

    instance_state = self.__dict__
    active_interceptors = instance_state.get('_interceptors', [])
    active_threads = instance_state.get('_interceptor_threads', [])

    # The same cut-off index is applied to both lists.
    if unused_only:
        cutoff = -2
    else:
        cutoff = len(active_interceptors)

    for stale_interceptor in active_interceptors[:cutoff]:
        stale_interceptor.stop_interception()
    del active_interceptors[:cutoff]

    for stale_thread in active_threads[:cutoff]:
        try:
            stale_thread.join()
        except Exception:
            warn(f"Failed to join interceptor threads:\n{traceback.format_exc()}")
    del active_threads[:cutoff]
|
539
655
|
|
540
656
|
|
541
657
|
def __repr__(self) -> str:
|
@@ -12,6 +12,7 @@ from meerschaum.utils.typing import SuccessTuple, List, Optional, Callable, Any,
|
|
12
12
|
from meerschaum.config._paths import DAEMON_RESOURCES_PATH
|
13
13
|
from meerschaum.utils.daemon.Daemon import Daemon
|
14
14
|
from meerschaum.utils.daemon.RotatingFile import RotatingFile
|
15
|
+
from meerschaum.utils.daemon.FileDescriptorInterceptor import FileDescriptorInterceptor
|
15
16
|
|
16
17
|
|
17
18
|
def daemon_entry(sysargs: Optional[List[str]] = None) -> SuccessTuple:
|
@@ -63,6 +64,8 @@ def daemon_entry(sysargs: Optional[List[str]] = None) -> SuccessTuple:
|
|
63
64
|
|
64
65
|
### Only run if the kwargs equal or no actions are provided.
|
65
66
|
if existing_kwargs == _args or not _args.get('action', []):
|
67
|
+
if daemon.status == 'running':
|
68
|
+
return True, f"Daemon '{daemon}' is already running."
|
66
69
|
return daemon.run(
|
67
70
|
debug = debug,
|
68
71
|
allow_dirty_run = True,
|
meerschaum/utils/dataframe.py
CHANGED
@@ -7,9 +7,10 @@ Utility functions for working with DataFrames.
|
|
7
7
|
"""
|
8
8
|
|
9
9
|
from __future__ import annotations
|
10
|
+
from datetime import datetime
|
10
11
|
from meerschaum.utils.typing import (
|
11
12
|
Optional, Dict, Any, List, Hashable, Generator,
|
12
|
-
Iterator, Iterable, Union,
|
13
|
+
Iterator, Iterable, Union, Tuple,
|
13
14
|
)
|
14
15
|
|
15
16
|
|
@@ -71,6 +72,7 @@ def add_missing_cols_to_df(df: 'pd.DataFrame', dtypes: Dict[str, Any]) -> pd.Dat
|
|
71
72
|
def filter_unseen_df(
|
72
73
|
old_df: 'pd.DataFrame',
|
73
74
|
new_df: 'pd.DataFrame',
|
75
|
+
safe_copy: bool = True,
|
74
76
|
dtypes: Optional[Dict[str, Any]] = None,
|
75
77
|
debug: bool = False,
|
76
78
|
) -> 'pd.DataFrame':
|
@@ -84,6 +86,10 @@ def filter_unseen_df(
|
|
84
86
|
|
85
87
|
new_df: 'pd.DataFrame'
|
86
88
|
The fetched (source) dataframe. Rows that are contained in `old_df` are removed.
|
89
|
+
|
90
|
+
safe_copy: bool, default True
|
91
|
+
If `True`, create a copy before comparing and modifying the dataframes.
|
92
|
+
Setting to `False` may mutate the DataFrames.
|
87
93
|
|
88
94
|
dtypes: Optional[Dict[str, Any]], default None
|
89
95
|
Optionally specify the datatypes of the dataframe.
|
@@ -111,6 +117,10 @@ def filter_unseen_df(
|
|
111
117
|
if old_df is None:
|
112
118
|
return new_df
|
113
119
|
|
120
|
+
if safe_copy:
|
121
|
+
old_df = old_df.copy()
|
122
|
+
new_df = new_df.copy()
|
123
|
+
|
114
124
|
import json
|
115
125
|
import functools
|
116
126
|
import traceback
|
@@ -118,6 +128,7 @@ def filter_unseen_df(
|
|
118
128
|
from meerschaum.utils.warnings import warn
|
119
129
|
from meerschaum.utils.packages import import_pandas, attempt_import
|
120
130
|
from meerschaum.utils.dtypes import to_pandas_dtype, are_dtypes_equal, attempt_cast_to_numeric
|
131
|
+
from meerschaum.utils.debug import dprint
|
121
132
|
pd = import_pandas(debug=debug)
|
122
133
|
is_dask = 'dask' in new_df.__module__
|
123
134
|
if is_dask:
|
@@ -243,12 +254,7 @@ def filter_unseen_df(
|
|
243
254
|
indicator = True,
|
244
255
|
)
|
245
256
|
changed_rows_mask = (joined_df['_merge'] == 'left_only')
|
246
|
-
|
247
|
-
delta_df = joined_df[
|
248
|
-
list(new_df_dtypes.keys())
|
249
|
-
][
|
250
|
-
changed_rows_mask
|
251
|
-
].reset_index(drop=True)
|
257
|
+
delta_df = joined_df[list(new_df_dtypes.keys())][changed_rows_mask].reset_index(drop=True)
|
252
258
|
|
253
259
|
for json_col in json_cols:
|
254
260
|
if json_col not in delta_df.columns:
|
@@ -535,6 +541,8 @@ def get_numeric_cols(df: 'pd.DataFrame') -> List[str]:
|
|
535
541
|
def enforce_dtypes(
|
536
542
|
df: 'pd.DataFrame',
|
537
543
|
dtypes: Dict[str, str],
|
544
|
+
safe_copy: bool = True,
|
545
|
+
coerce_numeric: bool = True,
|
538
546
|
debug: bool = False,
|
539
547
|
) -> 'pd.DataFrame':
|
540
548
|
"""
|
@@ -548,6 +556,14 @@ def enforce_dtypes(
|
|
548
556
|
dtypes: Dict[str, str]
|
549
557
|
The data types to attempt to enforce on the DataFrame.
|
550
558
|
|
559
|
+
safe_copy: bool, default True
|
560
|
+
If `True`, create a copy before comparing and modifying the dataframes.
|
561
|
+
Setting to `False` may mutate the DataFrames.
|
562
|
+
See `meerschaum.utils.dataframe.filter_unseen_df`.
|
563
|
+
|
564
|
+
coerce_numeric: bool, default True
|
565
|
+
If `True`, convert float and int collisions to numeric.
|
566
|
+
|
551
567
|
debug: bool, default False
|
552
568
|
Verbosity toggle.
|
553
569
|
|
@@ -569,6 +585,8 @@ def enforce_dtypes(
|
|
569
585
|
is_dtype_numeric,
|
570
586
|
attempt_cast_to_numeric,
|
571
587
|
)
|
588
|
+
if safe_copy:
|
589
|
+
df = df.copy()
|
572
590
|
df_dtypes = {c: str(t) for c, t in df.dtypes.items()}
|
573
591
|
if len(df_dtypes) == 0:
|
574
592
|
if debug:
|
@@ -674,7 +692,7 @@ def enforce_dtypes(
|
|
674
692
|
explicitly_numeric
|
675
693
|
or col in df_numeric_cols
|
676
694
|
or (mixed_numeric_types and not explicitly_float)
|
677
|
-
)
|
695
|
+
) and coerce_numeric
|
678
696
|
if cast_to_numeric:
|
679
697
|
common_dtypes[col] = attempt_cast_to_numeric
|
680
698
|
common_diff_dtypes[col] = attempt_cast_to_numeric
|
@@ -860,3 +878,160 @@ def get_first_valid_dask_partition(ddf: 'dask.dataframe.DataFrame') -> Union['pd
|
|
860
878
|
if len(pdf) > 0:
|
861
879
|
return pdf
|
862
880
|
return ddf.compute()
|
881
|
+
|
882
|
+
|
883
|
+
def query_df(
    df: 'pd.DataFrame',
    params: Optional[Dict[str, Any]] = None,
    begin: Union[datetime, int, None] = None,
    end: Union[datetime, int, None] = None,
    datetime_column: Optional[str] = None,
    select_columns: Optional[List[str]] = None,
    omit_columns: Optional[List[str]] = None,
    inplace: bool = False,
    reset_index: bool = False,
    debug: bool = False,
) -> 'pd.DataFrame':
    """
    Query the dataframe with the params dictionary.

    Parameters
    ----------
    df: pd.DataFrame
        The DataFrame to query against.

    params: Optional[Dict[str, Any]], default None
        The parameters dictionary to use for the query.

    begin: Union[datetime, int, None], default None
        If `begin` and `datetime_column` are provided, only return rows with a timestamp
        greater than or equal to this value.

    end: Union[datetime, int, None], default None
        If `end` and `datetime_column` are provided, only return rows with a timestamp
        less than this value.

    datetime_column: Optional[str], default None
        A `datetime_column` must be provided to use `begin` and `end`.

    select_columns: Optional[List[str]], default None
        If provided, only return these columns.
        `['*']` is treated the same as `None`.

    omit_columns: Optional[List[str]], default None
        If provided, do not include these columns in the result.
        Ignored when `select_columns` is provided.

    inplace: bool, default False
        If `True`, modify the DataFrame inplace rather than creating a new DataFrame.

    reset_index: bool, default False
        If `True`, reset the index in the resulting DataFrame.
        Only applied when rows are actually filtered.

    debug: bool, default False
        Verbosity toggle.

    Returns
    -------
    A Pandas DataFrame query result.
    """
    if select_columns == ['*']:
        select_columns = None

    def _restrict_columns(frame):
        """Delete columns from `frame` per `select_columns` (preferred) or `omit_columns`."""
        if select_columns:
            unwanted_cols = [col for col in frame.columns if col not in select_columns]
        elif omit_columns:
            unwanted_cols = [col for col in frame.columns if col in omit_columns]
        else:
            unwanted_cols = []
        for col in unwanted_cols:
            del frame[col]
        return frame

    def _unfiltered():
        """Return `df` without row filtering, copying only if columns must be dropped."""
        if not select_columns and not omit_columns:
            return df
        return _restrict_columns(df if inplace else df.copy())

    ### `0` is a valid bound (e.g. integer datetime axes), so compare against `None`.
    if not params and begin is None and end is None:
        return _unfiltered()

    import json
    from meerschaum.utils.debug import dprint
    from meerschaum.utils.misc import get_in_ex_params
    from meerschaum.utils.warnings import warn

    ### Remember the incoming dtypes to restore them after filtering.
    dtypes = {col: str(typ) for col, typ in df.dtypes.items()}

    if begin is not None or end is not None:
        if not datetime_column or datetime_column not in df.columns:
            warn(
                f"The datetime column '{datetime_column}' is not present in the Dataframe, "
                + "ignoring begin and end...",
            )
            begin, end = None, None

    if debug:
        dprint(f"Querying dataframe:\n{params=} {begin=} {end=} {datetime_column=}")

    in_ex_params = get_in_ex_params(params)

    def serialize(x: Any) -> str:
        """Convert a value into a string so that unlike types may be compared."""
        if isinstance(x, (dict, list, tuple)):
            return json.dumps(x, sort_keys=True, separators=(',', ':'), default=str)
        if hasattr(x, 'isoformat'):
            return x.isoformat()
        return str(x)

    ### Collect only real boolean Series; a bare `True` cannot be passed to `df.where()`.
    masks = []
    if begin is not None:
        masks.append(df[datetime_column] >= begin)
    if end is not None:
        masks.append(df[datetime_column] < end)

    for col, (in_vals, ex_vals) in in_ex_params.items():
        if col not in df.columns or not (in_vals or ex_vals):
            continue
        serialized_col = df[col].apply(serialize)
        if in_vals:
            masks.append(serialized_col.isin([serialize(_in_val) for _in_val in in_vals]))
        if ex_vals:
            masks.append(~serialized_col.isin([serialize(_ex_val) for _ex_val in ex_vals]))

    ### Nothing applicable to filter on (e.g. params for absent columns).
    if not masks:
        return _unfiltered()

    query_mask = masks[0]
    for mask in masks[1:]:
        query_mask = query_mask & mask

    if inplace:
        df.where(query_mask, inplace=True)
        df.dropna(how='all', inplace=True)
        result_df = df
    else:
        result_df = df.where(query_mask).dropna(how='all')

    if reset_index:
        result_df.reset_index(drop=True, inplace=True)

    result_df = enforce_dtypes(
        result_df,
        dtypes,
        safe_copy = (not inplace),
        debug = debug,
        coerce_numeric = False,
    )

    result_df = _restrict_columns(result_df)
    if debug and omit_columns and not select_columns:
        dprint(f"{dtypes=}")
    return result_df
|