meerschaum 2.1.6__py3-none-any.whl → 2.2.0.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- meerschaum/_internal/arguments/_parser.py +3 -0
- meerschaum/_internal/entry.py +2 -1
- meerschaum/_internal/shell/Shell.py +1 -6
- meerschaum/actions/api.py +1 -1
- meerschaum/actions/install.py +7 -3
- meerschaum/actions/sync.py +7 -3
- meerschaum/api/dash/callbacks/dashboard.py +77 -8
- meerschaum/api/dash/callbacks/jobs.py +55 -3
- meerschaum/api/dash/jobs.py +34 -8
- meerschaum/api/dash/pipes.py +79 -11
- meerschaum/api/resources/static/js/xterm.js +1 -1
- meerschaum/config/_shell.py +0 -1
- meerschaum/config/_version.py +1 -1
- meerschaum/connectors/api/_plugins.py +2 -1
- meerschaum/connectors/sql/_create_engine.py +5 -5
- meerschaum/connectors/sql/_fetch.py +8 -11
- meerschaum/connectors/sql/_pipes.py +7 -1
- meerschaum/core/Pipe/_dtypes.py +2 -1
- meerschaum/core/Pipe/_sync.py +26 -13
- meerschaum/plugins/_Plugin.py +11 -2
- meerschaum/utils/daemon/Daemon.py +11 -3
- meerschaum/utils/dataframe.py +183 -8
- meerschaum/utils/dtypes/__init__.py +9 -5
- meerschaum/utils/formatting/_pipes.py +44 -10
- meerschaum/utils/misc.py +34 -2
- meerschaum/utils/packages/__init__.py +4 -3
- meerschaum/utils/packages/_packages.py +1 -1
- meerschaum/utils/typing.py +1 -1
- {meerschaum-2.1.6.dist-info → meerschaum-2.2.0.dev1.dist-info}/METADATA +5 -5
- {meerschaum-2.1.6.dist-info → meerschaum-2.2.0.dev1.dist-info}/RECORD +36 -36
- {meerschaum-2.1.6.dist-info → meerschaum-2.2.0.dev1.dist-info}/LICENSE +0 -0
- {meerschaum-2.1.6.dist-info → meerschaum-2.2.0.dev1.dist-info}/NOTICE +0 -0
- {meerschaum-2.1.6.dist-info → meerschaum-2.2.0.dev1.dist-info}/WHEEL +0 -0
- {meerschaum-2.1.6.dist-info → meerschaum-2.2.0.dev1.dist-info}/entry_points.txt +0 -0
- {meerschaum-2.1.6.dist-info → meerschaum-2.2.0.dev1.dist-info}/top_level.txt +0 -0
- {meerschaum-2.1.6.dist-info → meerschaum-2.2.0.dev1.dist-info}/zip-safe +0 -0
meerschaum/config/_shell.py
CHANGED
meerschaum/config/_version.py
CHANGED
@@ -49,6 +49,7 @@ def register_plugin(
|
|
49
49
|
def install_plugin(
|
50
50
|
self,
|
51
51
|
name: str,
|
52
|
+
skip_deps: bool = False,
|
52
53
|
force: bool = False,
|
53
54
|
debug: bool = False
|
54
55
|
) -> SuccessTuple:
|
@@ -78,7 +79,7 @@ def install_plugin(
|
|
78
79
|
success, msg = False, fail_msg
|
79
80
|
return success, msg
|
80
81
|
plugin = Plugin(name, archive_path=archive_path, repo_connector=self)
|
81
|
-
return plugin.install(force=force, debug=debug)
|
82
|
+
return plugin.install(skip_deps=skip_deps, force=force, debug=debug)
|
82
83
|
|
83
84
|
def get_plugins(
|
84
85
|
self,
|
@@ -154,10 +154,10 @@ install_flavor_drivers = {
|
|
154
154
|
'duckdb': ['duckdb', 'duckdb_engine'],
|
155
155
|
'mysql': ['pymysql'],
|
156
156
|
'mariadb': ['pymysql'],
|
157
|
-
'timescaledb': ['
|
158
|
-
'postgresql': ['
|
159
|
-
'citus': ['
|
160
|
-
'cockroachdb': ['
|
157
|
+
'timescaledb': ['psycopg'],
|
158
|
+
'postgresql': ['psycopg'],
|
159
|
+
'citus': ['psycopg'],
|
160
|
+
'cockroachdb': ['psycopg', 'sqlalchemy_cockroachdb', 'sqlalchemy_cockroachdb.psycopg'],
|
161
161
|
'mssql': ['pyodbc'],
|
162
162
|
'oracle': ['cx_Oracle'],
|
163
163
|
}
|
@@ -165,7 +165,7 @@ require_patching_flavors = {'cockroachdb': [('sqlalchemy-cockroachdb', 'sqlalche
|
|
165
165
|
|
166
166
|
flavor_dialects = {
|
167
167
|
'cockroachdb': (
|
168
|
-
'cockroachdb', 'sqlalchemy_cockroachdb.
|
168
|
+
'cockroachdb', 'sqlalchemy_cockroachdb.psycopg', 'CockroachDBDialect_psycopg'
|
169
169
|
),
|
170
170
|
'duckdb': ('duckdb', 'duckdb_engine', 'Dialect'),
|
171
171
|
}
|
@@ -174,9 +174,6 @@ def get_pipe_metadef(
|
|
174
174
|
)
|
175
175
|
|
176
176
|
|
177
|
-
if 'order by' in definition.lower() and 'over' not in definition.lower():
|
178
|
-
error("Cannot fetch with an ORDER clause in the definition")
|
179
|
-
|
180
177
|
apply_backtrack = begin == '' and check_existing
|
181
178
|
backtrack_interval = pipe.get_backtrack_interval(check_existing=check_existing, debug=debug)
|
182
179
|
btm = (
|
@@ -308,9 +305,9 @@ def _simple_fetch_query(pipe, debug: bool=False, **kw) -> str:
|
|
308
305
|
def_name = 'definition'
|
309
306
|
definition = get_pipe_query(pipe)
|
310
307
|
return (
|
311
|
-
f"WITH {def_name} AS ({definition}) SELECT * FROM {def_name}"
|
308
|
+
f"WITH {def_name} AS (\n{definition}\n) SELECT * FROM {def_name}"
|
312
309
|
if pipe.connector.flavor not in ('mysql', 'mariadb')
|
313
|
-
else f"SELECT * FROM ({definition}) AS {def_name}"
|
310
|
+
else f"SELECT * FROM (\n{definition}\n) AS {def_name}"
|
314
311
|
)
|
315
312
|
|
316
313
|
def _join_fetch_query(
|
@@ -363,10 +360,10 @@ def _join_fetch_query(
|
|
363
360
|
)
|
364
361
|
+ f") AS {id_remote_name}, "
|
365
362
|
+ dateadd_str(
|
366
|
-
flavor=pipe.connector.flavor,
|
367
|
-
begin=_st,
|
368
|
-
datepart='minute',
|
369
|
-
number=pipe.parameters.get('fetch', {}).get('backtrack_minutes', 0)
|
363
|
+
flavor = pipe.connector.flavor,
|
364
|
+
begin = _st,
|
365
|
+
datepart = 'minute',
|
366
|
+
number = pipe.parameters.get('fetch', {}).get('backtrack_minutes', 0)
|
370
367
|
) + " AS " + dt_remote_name + "\nUNION ALL\n"
|
371
368
|
)
|
372
369
|
_sync_times_q = _sync_times_q[:(-1 * len('UNION ALL\n'))] + ")"
|
@@ -374,13 +371,13 @@ def _join_fetch_query(
|
|
374
371
|
definition = get_pipe_query(pipe)
|
375
372
|
query = (
|
376
373
|
f"""
|
377
|
-
WITH definition AS ({definition}){_sync_times_q}
|
374
|
+
WITH definition AS (\n{definition}\n){_sync_times_q}
|
378
375
|
SELECT definition.*
|
379
376
|
FROM definition"""
|
380
377
|
if pipe.connector.flavor not in ('mysql', 'mariadb')
|
381
378
|
else (
|
382
379
|
f"""
|
383
|
-
SELECT * FROM ({definition}) AS definition"""
|
380
|
+
SELECT * FROM (\n{definition}\n) AS definition"""
|
384
381
|
)
|
385
382
|
) + f"""
|
386
383
|
LEFT OUTER JOIN {sync_times_remote_name} AS st
|
@@ -1182,7 +1182,12 @@ def sync_pipe(
|
|
1182
1182
|
dprint("Fetched data:\n" + str(df))
|
1183
1183
|
|
1184
1184
|
if not isinstance(df, pd.DataFrame):
|
1185
|
-
df = pipe.enforce_dtypes(
|
1185
|
+
df = pipe.enforce_dtypes(
|
1186
|
+
df,
|
1187
|
+
chunksize = chunksize,
|
1188
|
+
safe_copy = kw.get('safe_copy', False),
|
1189
|
+
debug = debug,
|
1190
|
+
)
|
1186
1191
|
|
1187
1192
|
### if table does not exist, create it with indices
|
1188
1193
|
is_new = False
|
@@ -1226,6 +1231,7 @@ def sync_pipe(
|
|
1226
1231
|
upsert = pipe.parameters.get('upsert', False) and (self.flavor + '-upsert') in update_queries
|
1227
1232
|
if upsert:
|
1228
1233
|
check_existing = False
|
1234
|
+
kw['safe_copy'] = kw.get('safe_copy', False)
|
1229
1235
|
|
1230
1236
|
unseen_df, update_df, delta_df = (
|
1231
1237
|
pipe.filter_existing(
|
meerschaum/core/Pipe/_dtypes.py
CHANGED
@@ -14,6 +14,7 @@ def enforce_dtypes(
|
|
14
14
|
self,
|
15
15
|
df: 'pd.DataFrame',
|
16
16
|
chunksize: Optional[int] = -1,
|
17
|
+
safe_copy: bool = True,
|
17
18
|
debug: bool = False,
|
18
19
|
) -> 'pd.DataFrame':
|
19
20
|
"""
|
@@ -71,7 +72,7 @@ def enforce_dtypes(
|
|
71
72
|
)
|
72
73
|
return df
|
73
74
|
|
74
|
-
return _enforce_dtypes(df, pipe_dtypes, debug=debug)
|
75
|
+
return _enforce_dtypes(df, pipe_dtypes, safe_copy=safe_copy, debug=debug)
|
75
76
|
|
76
77
|
|
77
78
|
def infer_dtypes(self, persist: bool=False, debug: bool=False) -> Dict[str, Any]:
|
meerschaum/core/Pipe/_sync.py
CHANGED
@@ -12,6 +12,7 @@ import json
|
|
12
12
|
import time
|
13
13
|
import threading
|
14
14
|
import multiprocessing
|
15
|
+
import functools
|
15
16
|
from datetime import datetime, timedelta
|
16
17
|
|
17
18
|
from meerschaum.utils.typing import (
|
@@ -518,6 +519,8 @@ def exists(
|
|
518
519
|
def filter_existing(
|
519
520
|
self,
|
520
521
|
df: 'pd.DataFrame',
|
522
|
+
safe_copy: bool = True,
|
523
|
+
date_bound_only: bool = False,
|
521
524
|
chunksize: Optional[int] = -1,
|
522
525
|
debug: bool = False,
|
523
526
|
**kw
|
@@ -530,6 +533,14 @@ def filter_existing(
|
|
530
533
|
df: 'pd.DataFrame'
|
531
534
|
The dataframe to inspect and filter.
|
532
535
|
|
536
|
+
safe_copy: bool, default True
|
537
|
+
If `True`, create a copy before comparing and modifying the dataframes.
|
538
|
+
Setting to `False` may mutate the DataFrames.
|
539
|
+
See `meerschaum.utils.dataframe.filter_unseen_df`.
|
540
|
+
|
541
|
+
date_bound_only: bool, default False
|
542
|
+
If `True`, only use the datetime index to fetch the sample dataframe.
|
543
|
+
|
533
544
|
chunksize: Optional[int], default -1
|
534
545
|
The `chunksize` used when fetching existing data.
|
535
546
|
|
@@ -567,7 +578,8 @@ def filter_existing(
|
|
567
578
|
else:
|
568
579
|
merge = pd.merge
|
569
580
|
NA = pd.NA
|
570
|
-
|
581
|
+
if df is None:
|
582
|
+
return df, df, df
|
571
583
|
if (df.empty if not is_dask else len(df) == 0):
|
572
584
|
return df, df, df
|
573
585
|
|
@@ -617,7 +629,7 @@ def filter_existing(
|
|
617
629
|
traceback.print_exc()
|
618
630
|
max_dt = None
|
619
631
|
|
620
|
-
if
|
632
|
+
if ('datetime' not in str(type(max_dt))) or str(min_dt) == 'NaT':
|
621
633
|
if 'int' not in str(type(max_dt)).lower():
|
622
634
|
max_dt = None
|
623
635
|
|
@@ -645,7 +657,7 @@ def filter_existing(
|
|
645
657
|
col: df[col].unique()
|
646
658
|
for col in self.columns
|
647
659
|
if col in df.columns and col != dt_col
|
648
|
-
}
|
660
|
+
} if not date_bound_only else {}
|
649
661
|
filter_params_index_limit = get_config('pipes', 'sync', 'filter_params_index_limit')
|
650
662
|
_ = kw.pop('params', None)
|
651
663
|
params = {
|
@@ -655,7 +667,7 @@ def filter_existing(
|
|
655
667
|
]
|
656
668
|
for col, unique_vals in unique_index_vals.items()
|
657
669
|
if len(unique_vals) <= filter_params_index_limit
|
658
|
-
}
|
670
|
+
} if not date_bound_only else {}
|
659
671
|
|
660
672
|
if debug:
|
661
673
|
dprint(f"Looking at data between '{begin}' and '{end}':", **kw)
|
@@ -698,18 +710,23 @@ def filter_existing(
|
|
698
710
|
col: to_pandas_dtype(typ)
|
699
711
|
for col, typ in self_dtypes.items()
|
700
712
|
},
|
713
|
+
safe_copy = safe_copy,
|
701
714
|
debug = debug
|
702
715
|
),
|
703
716
|
on_cols_dtypes,
|
704
717
|
)
|
705
718
|
|
706
719
|
### Cast dicts or lists to strings so we can merge.
|
720
|
+
serializer = functools.partial(json.dumps, sort_keys=True, separators=(',', ':'), default=str)
|
721
|
+
def deserializer(x):
|
722
|
+
return json.loads(x) if isinstance(x, str) else x
|
723
|
+
|
707
724
|
unhashable_delta_cols = get_unhashable_cols(delta_df)
|
708
725
|
unhashable_backtrack_cols = get_unhashable_cols(backtrack_df)
|
709
726
|
for col in unhashable_delta_cols:
|
710
|
-
delta_df[col] = delta_df[col].apply(
|
727
|
+
delta_df[col] = delta_df[col].apply(serializer)
|
711
728
|
for col in unhashable_backtrack_cols:
|
712
|
-
backtrack_df[col] = backtrack_df[col].apply(
|
729
|
+
backtrack_df[col] = backtrack_df[col].apply(serializer)
|
713
730
|
casted_cols = set(unhashable_delta_cols + unhashable_backtrack_cols)
|
714
731
|
|
715
732
|
joined_df = merge(
|
@@ -722,13 +739,9 @@ def filter_existing(
|
|
722
739
|
) if on_cols else delta_df
|
723
740
|
for col in casted_cols:
|
724
741
|
if col in joined_df.columns:
|
725
|
-
joined_df[col] = joined_df[col].apply(
|
726
|
-
|
727
|
-
|
728
|
-
if isinstance(x, str)
|
729
|
-
else x
|
730
|
-
)
|
731
|
-
)
|
742
|
+
joined_df[col] = joined_df[col].apply(deserializer)
|
743
|
+
if col in delta_df.columns:
|
744
|
+
delta_df[col] = delta_df[col].apply(deserializer)
|
732
745
|
|
733
746
|
### Determine which rows are completely new.
|
734
747
|
new_rows_mask = (joined_df['_merge'] == 'left_only') if on_cols else None
|
meerschaum/plugins/_Plugin.py
CHANGED
@@ -252,6 +252,7 @@ class Plugin:
|
|
252
252
|
|
253
253
|
def install(
|
254
254
|
self,
|
255
|
+
skip_deps: bool = False,
|
255
256
|
force: bool = False,
|
256
257
|
debug: bool = False,
|
257
258
|
) -> SuccessTuple:
|
@@ -263,6 +264,9 @@ class Plugin:
|
|
263
264
|
|
264
265
|
Parameters
|
265
266
|
----------
|
267
|
+
skip_deps: bool, default False
|
268
|
+
If `True`, do not install dependencies.
|
269
|
+
|
266
270
|
force: bool, default False
|
267
271
|
If `True`, continue with installation, even if required packages fail to install.
|
268
272
|
|
@@ -366,7 +370,11 @@ class Plugin:
|
|
366
370
|
plugin_installation_dir_path = path
|
367
371
|
break
|
368
372
|
|
369
|
-
success_msg =
|
373
|
+
success_msg = (
|
374
|
+
f"Successfully installed plugin '{self}'"
|
375
|
+
+ ("\n (skipped dependencies)" if skip_deps else "")
|
376
|
+
+ "."
|
377
|
+
)
|
370
378
|
success, abort = None, None
|
371
379
|
|
372
380
|
if is_same_version and not force:
|
@@ -423,7 +431,8 @@ class Plugin:
|
|
423
431
|
return success, msg
|
424
432
|
|
425
433
|
### attempt to install dependencies
|
426
|
-
|
434
|
+
dependencies_installed = skip_deps or self.install_dependencies(force=force, debug=debug)
|
435
|
+
if not dependencies_installed:
|
427
436
|
_ongoing_installations.remove(self.full_name)
|
428
437
|
return False, f"Failed to install dependencies for plugin '{self}'."
|
429
438
|
|
@@ -865,21 +865,29 @@ class Daemon:
|
|
865
865
|
error(_write_pickle_success_tuple[1])
|
866
866
|
|
867
867
|
|
868
|
-
def cleanup(self, keep_logs: bool = False) ->
|
869
|
-
"""
|
868
|
+
def cleanup(self, keep_logs: bool = False) -> SuccessTuple:
|
869
|
+
"""
|
870
|
+
Remove a daemon's directory after execution.
|
870
871
|
|
871
872
|
Parameters
|
872
873
|
----------
|
873
874
|
keep_logs: bool, default False
|
874
875
|
If `True`, skip deleting the daemon's log files.
|
876
|
+
|
877
|
+
Returns
|
878
|
+
-------
|
879
|
+
A `SuccessTuple` indicating success.
|
875
880
|
"""
|
876
881
|
if self.path.exists():
|
877
882
|
try:
|
878
883
|
shutil.rmtree(self.path)
|
879
884
|
except Exception as e:
|
880
|
-
|
885
|
+
msg = f"Failed to clean up '{self.daemon_id}':\n{e}"
|
886
|
+
warn(msg)
|
887
|
+
return False, msg
|
881
888
|
if not keep_logs:
|
882
889
|
self.rotating_log.delete()
|
890
|
+
return True, "Success"
|
883
891
|
|
884
892
|
|
885
893
|
def get_timeout_seconds(self, timeout: Union[int, float, None] = None) -> Union[int, float]:
|
meerschaum/utils/dataframe.py
CHANGED
@@ -7,9 +7,10 @@ Utility functions for working with DataFrames.
|
|
7
7
|
"""
|
8
8
|
|
9
9
|
from __future__ import annotations
|
10
|
+
from datetime import datetime
|
10
11
|
from meerschaum.utils.typing import (
|
11
12
|
Optional, Dict, Any, List, Hashable, Generator,
|
12
|
-
Iterator, Iterable, Union,
|
13
|
+
Iterator, Iterable, Union, Tuple,
|
13
14
|
)
|
14
15
|
|
15
16
|
|
@@ -71,6 +72,7 @@ def add_missing_cols_to_df(df: 'pd.DataFrame', dtypes: Dict[str, Any]) -> pd.Dat
|
|
71
72
|
def filter_unseen_df(
|
72
73
|
old_df: 'pd.DataFrame',
|
73
74
|
new_df: 'pd.DataFrame',
|
75
|
+
safe_copy: bool = True,
|
74
76
|
dtypes: Optional[Dict[str, Any]] = None,
|
75
77
|
debug: bool = False,
|
76
78
|
) -> 'pd.DataFrame':
|
@@ -84,6 +86,10 @@ def filter_unseen_df(
|
|
84
86
|
|
85
87
|
new_df: 'pd.DataFrame'
|
86
88
|
The fetched (source) dataframe. Rows that are contained in `old_df` are removed.
|
89
|
+
|
90
|
+
safe_copy: bool, default True
|
91
|
+
If `True`, create a copy before comparing and modifying the dataframes.
|
92
|
+
Setting to `False` may mutate the DataFrames.
|
87
93
|
|
88
94
|
dtypes: Optional[Dict[str, Any]], default None
|
89
95
|
Optionally specify the datatypes of the dataframe.
|
@@ -111,6 +117,10 @@ def filter_unseen_df(
|
|
111
117
|
if old_df is None:
|
112
118
|
return new_df
|
113
119
|
|
120
|
+
if safe_copy:
|
121
|
+
old_df = old_df.copy()
|
122
|
+
new_df = new_df.copy()
|
123
|
+
|
114
124
|
import json
|
115
125
|
import functools
|
116
126
|
import traceback
|
@@ -118,6 +128,7 @@ def filter_unseen_df(
|
|
118
128
|
from meerschaum.utils.warnings import warn
|
119
129
|
from meerschaum.utils.packages import import_pandas, attempt_import
|
120
130
|
from meerschaum.utils.dtypes import to_pandas_dtype, are_dtypes_equal, attempt_cast_to_numeric
|
131
|
+
from meerschaum.utils.debug import dprint
|
121
132
|
pd = import_pandas(debug=debug)
|
122
133
|
is_dask = 'dask' in new_df.__module__
|
123
134
|
if is_dask:
|
@@ -243,12 +254,7 @@ def filter_unseen_df(
|
|
243
254
|
indicator = True,
|
244
255
|
)
|
245
256
|
changed_rows_mask = (joined_df['_merge'] == 'left_only')
|
246
|
-
|
247
|
-
delta_df = joined_df[
|
248
|
-
list(new_df_dtypes.keys())
|
249
|
-
][
|
250
|
-
changed_rows_mask
|
251
|
-
].reset_index(drop=True)
|
257
|
+
delta_df = joined_df[list(new_df_dtypes.keys())][changed_rows_mask].reset_index(drop=True)
|
252
258
|
|
253
259
|
for json_col in json_cols:
|
254
260
|
if json_col not in delta_df.columns:
|
@@ -535,6 +541,8 @@ def get_numeric_cols(df: 'pd.DataFrame') -> List[str]:
|
|
535
541
|
def enforce_dtypes(
|
536
542
|
df: 'pd.DataFrame',
|
537
543
|
dtypes: Dict[str, str],
|
544
|
+
safe_copy: bool = True,
|
545
|
+
coerce_numeric: bool = True,
|
538
546
|
debug: bool = False,
|
539
547
|
) -> 'pd.DataFrame':
|
540
548
|
"""
|
@@ -548,6 +556,14 @@ def enforce_dtypes(
|
|
548
556
|
dtypes: Dict[str, str]
|
549
557
|
The data types to attempt to enforce on the DataFrame.
|
550
558
|
|
559
|
+
safe_copy: bool, default True
|
560
|
+
If `True`, create a copy before comparing and modifying the dataframes.
|
561
|
+
Setting to `False` may mutate the DataFrames.
|
562
|
+
See `meerschaum.utils.dataframe.filter_unseen_df`.
|
563
|
+
|
564
|
+
coerce_numeric: bool, default True
|
565
|
+
If `True`, convert float and int collisions to numeric.
|
566
|
+
|
551
567
|
debug: bool, default False
|
552
568
|
Verbosity toggle.
|
553
569
|
|
@@ -569,6 +585,8 @@ def enforce_dtypes(
|
|
569
585
|
is_dtype_numeric,
|
570
586
|
attempt_cast_to_numeric,
|
571
587
|
)
|
588
|
+
if safe_copy:
|
589
|
+
df = df.copy()
|
572
590
|
df_dtypes = {c: str(t) for c, t in df.dtypes.items()}
|
573
591
|
if len(df_dtypes) == 0:
|
574
592
|
if debug:
|
@@ -674,7 +692,7 @@ def enforce_dtypes(
|
|
674
692
|
explicitly_numeric
|
675
693
|
or col in df_numeric_cols
|
676
694
|
or (mixed_numeric_types and not explicitly_float)
|
677
|
-
)
|
695
|
+
) and coerce_numeric
|
678
696
|
if cast_to_numeric:
|
679
697
|
common_dtypes[col] = attempt_cast_to_numeric
|
680
698
|
common_diff_dtypes[col] = attempt_cast_to_numeric
|
@@ -860,3 +878,160 @@ def get_first_valid_dask_partition(ddf: 'dask.dataframe.DataFrame') -> Union['pd
|
|
860
878
|
if len(pdf) > 0:
|
861
879
|
return pdf
|
862
880
|
return ddf.compute()
|
881
|
+
|
882
|
+
|
883
|
+
def query_df(
|
884
|
+
df: 'pd.DataFrame',
|
885
|
+
params: Optional[Dict[str, Any]] = None,
|
886
|
+
begin: Union[datetime, int, None] = None,
|
887
|
+
end: Union[datetime, int, None] = None,
|
888
|
+
datetime_column: Optional[str] = None,
|
889
|
+
select_columns: Optional[List[str]] = None,
|
890
|
+
omit_columns: Optional[List[str]] = None,
|
891
|
+
inplace: bool = False,
|
892
|
+
reset_index: bool = False,
|
893
|
+
debug: bool = False,
|
894
|
+
) -> 'pd.DataFrame':
|
895
|
+
"""
|
896
|
+
Query the dataframe with the params dictionary.
|
897
|
+
|
898
|
+
Parameters
|
899
|
+
----------
|
900
|
+
df: pd.DataFrame
|
901
|
+
The DataFrame to query against.
|
902
|
+
|
903
|
+
params: Optional[Dict[str, Any]], default None
|
904
|
+
The parameters dictionary to use for the query.
|
905
|
+
|
906
|
+
begin: Union[datetime, int, None], default None
|
907
|
+
If `begin` and `datetime_column` are provided, only return rows with a timestamp
|
908
|
+
greater than or equal to this value.
|
909
|
+
|
910
|
+
end: Union[datetime, int, None], default None
|
911
|
+
If `end` and `datetime_column` are provided, only return rows with a timestamp
|
912
|
+
less than this value.
|
913
|
+
|
914
|
+
datetime_column: Optional[str], default None
|
915
|
+
A `datetime_column` must be provided to use `begin` and `end`.
|
916
|
+
|
917
|
+
select_columns: Optional[List[str]], default None
|
918
|
+
If provided, only return these columns.
|
919
|
+
|
920
|
+
omit_columns: Optional[List[str]], default None
|
921
|
+
If provided, do not include these columns in the result.
|
922
|
+
|
923
|
+
inplace: bool, default False
|
924
|
+
If `True`, modify the DataFrame inplace rather than creating a new DataFrame.
|
925
|
+
|
926
|
+
reset_index: bool, default False
|
927
|
+
If `True`, reset the index in the resulting DataFrame.
|
928
|
+
|
929
|
+
Returns
|
930
|
+
-------
|
931
|
+
A Pandas DataFrame query result.
|
932
|
+
"""
|
933
|
+
if not params and not begin and not end:
|
934
|
+
return df
|
935
|
+
|
936
|
+
import json
|
937
|
+
import meerschaum as mrsm
|
938
|
+
from meerschaum.utils.debug import dprint
|
939
|
+
from meerschaum.utils.misc import get_in_ex_params
|
940
|
+
from meerschaum.utils.warnings import warn
|
941
|
+
|
942
|
+
dtypes = {col: str(typ) for col, typ in df.dtypes.items()}
|
943
|
+
|
944
|
+
if begin or end:
|
945
|
+
if not datetime_column or datetime_column not in df.columns:
|
946
|
+
warn(
|
947
|
+
f"The datetime column '{datetime_column}' is not present in the Dataframe, "
|
948
|
+
+ "ignoring begin and end...",
|
949
|
+
)
|
950
|
+
begin, end = None, None
|
951
|
+
|
952
|
+
if debug:
|
953
|
+
dprint(f"Querying dataframe:\n{params=} {begin=} {end=} {datetime_column=}")
|
954
|
+
|
955
|
+
in_ex_params = get_in_ex_params(params)
|
956
|
+
|
957
|
+
def serialize(x: Any) -> str:
|
958
|
+
if isinstance(x, (dict, list, tuple)):
|
959
|
+
return json.dumps(x, sort_keys=True, separators=(',', ':'), default=str)
|
960
|
+
if hasattr(x, 'isoformat'):
|
961
|
+
return x.isoformat()
|
962
|
+
return str(x)
|
963
|
+
|
964
|
+
masks = [
|
965
|
+
(
|
966
|
+
(df[datetime_column] >= begin)
|
967
|
+
if begin is not None and datetime_column
|
968
|
+
else True
|
969
|
+
) & (
|
970
|
+
(df[datetime_column] < end)
|
971
|
+
if end is not None and datetime_column
|
972
|
+
else True
|
973
|
+
)
|
974
|
+
]
|
975
|
+
|
976
|
+
masks.extend([
|
977
|
+
(
|
978
|
+
(
|
979
|
+
df[col].apply(serialize).isin(
|
980
|
+
[
|
981
|
+
serialize(_in_val)
|
982
|
+
for _in_val in in_vals
|
983
|
+
]
|
984
|
+
) if in_vals else True
|
985
|
+
) & (
|
986
|
+
~df[col].apply(serialize).isin(
|
987
|
+
[
|
988
|
+
serialize(_ex_val)
|
989
|
+
for _ex_val in ex_vals
|
990
|
+
]
|
991
|
+
) if ex_vals else True
|
992
|
+
)
|
993
|
+
)
|
994
|
+
for col, (in_vals, ex_vals) in in_ex_params.items()
|
995
|
+
if col in df.columns
|
996
|
+
])
|
997
|
+
query_mask = masks[0]
|
998
|
+
for mask in masks:
|
999
|
+
query_mask = query_mask & mask
|
1000
|
+
|
1001
|
+
if inplace:
|
1002
|
+
df.where(query_mask, inplace=inplace)
|
1003
|
+
df.dropna(how='all', inplace=inplace)
|
1004
|
+
result_df = df
|
1005
|
+
else:
|
1006
|
+
result_df = df.where(query_mask).dropna(how='all')
|
1007
|
+
|
1008
|
+
if reset_index:
|
1009
|
+
result_df.reset_index(drop=True, inplace=True)
|
1010
|
+
|
1011
|
+
result_df = enforce_dtypes(
|
1012
|
+
result_df,
|
1013
|
+
dtypes,
|
1014
|
+
safe_copy = (not inplace),
|
1015
|
+
debug = debug,
|
1016
|
+
coerce_numeric = False,
|
1017
|
+
)
|
1018
|
+
|
1019
|
+
if select_columns == ['*']:
|
1020
|
+
select_columns = None
|
1021
|
+
|
1022
|
+
if not select_columns and not omit_columns:
|
1023
|
+
return result_df
|
1024
|
+
|
1025
|
+
if select_columns:
|
1026
|
+
for col in list(result_df.columns):
|
1027
|
+
if col not in select_columns:
|
1028
|
+
del result_df[col]
|
1029
|
+
return result_df
|
1030
|
+
|
1031
|
+
if omit_columns:
|
1032
|
+
for col in list(result_df.columns):
|
1033
|
+
if col in omit_columns:
|
1034
|
+
del result_df[col]
|
1035
|
+
if debug:
|
1036
|
+
dprint(f"{dtypes=}")
|
1037
|
+
return result_df
|
@@ -6,8 +6,10 @@
|
|
6
6
|
Utility functions for working with data types.
|
7
7
|
"""
|
8
8
|
|
9
|
+
import traceback
|
9
10
|
from decimal import Decimal, Context, InvalidOperation
|
10
11
|
from meerschaum.utils.typing import Dict, Union, Any
|
12
|
+
from meerschaum.utils.warnings import warn
|
11
13
|
|
12
14
|
MRSM_PD_DTYPES: Dict[str, str] = {
|
13
15
|
'json': 'object',
|
@@ -37,9 +39,7 @@ def to_pandas_dtype(dtype: str) -> str:
|
|
37
39
|
from meerschaum.utils.dtypes.sql import get_pd_type_from_db_type
|
38
40
|
return get_pd_type_from_db_type(dtype)
|
39
41
|
|
40
|
-
import traceback
|
41
42
|
from meerschaum.utils.packages import attempt_import
|
42
|
-
from meerschaum.utils.warnings import warn
|
43
43
|
pandas = attempt_import('pandas', lazy=False)
|
44
44
|
|
45
45
|
try:
|
@@ -88,8 +88,12 @@ def are_dtypes_equal(
|
|
88
88
|
return False
|
89
89
|
return True
|
90
90
|
|
91
|
-
|
92
|
-
|
91
|
+
try:
|
92
|
+
if ldtype == rdtype:
|
93
|
+
return True
|
94
|
+
except Exception as e:
|
95
|
+
warn(f"Exception when comparing dtypes, returning False:\n{traceback.format_exc()}")
|
96
|
+
return False
|
93
97
|
|
94
98
|
### Sometimes pandas dtype objects are passed.
|
95
99
|
ldtype = str(ldtype)
|
@@ -177,7 +181,7 @@ def attempt_cast_to_numeric(value: Any) -> Any:
|
|
177
181
|
return value
|
178
182
|
|
179
183
|
|
180
|
-
def value_is_null(value: Any) ->
|
184
|
+
def value_is_null(value: Any) -> bool:
|
181
185
|
"""
|
182
186
|
Determine if a value is a null-like string.
|
183
187
|
"""
|