meerschaum 2.3.6__py3-none-any.whl → 2.4.0.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- meerschaum/actions/bootstrap.py +36 -10
- meerschaum/actions/copy.py +3 -3
- meerschaum/actions/start.py +13 -14
- meerschaum/api/dash/__init__.py +7 -6
- meerschaum/api/dash/callbacks/__init__.py +1 -0
- meerschaum/api/dash/callbacks/dashboard.py +7 -5
- meerschaum/api/dash/callbacks/pipes.py +42 -0
- meerschaum/api/dash/pages/__init__.py +1 -0
- meerschaum/api/dash/pages/pipes.py +16 -0
- meerschaum/api/dash/pipes.py +79 -47
- meerschaum/api/dash/users.py +19 -6
- meerschaum/api/routes/_login.py +4 -4
- meerschaum/api/routes/_pipes.py +3 -3
- meerschaum/config/_default.py +9 -1
- meerschaum/config/_version.py +1 -1
- meerschaum/config/stack/__init__.py +59 -16
- meerschaum/connectors/Connector.py +19 -13
- meerschaum/connectors/__init__.py +9 -5
- meerschaum/connectors/poll.py +30 -24
- meerschaum/connectors/sql/_pipes.py +126 -154
- meerschaum/connectors/sql/_plugins.py +45 -43
- meerschaum/connectors/sql/_users.py +46 -38
- meerschaum/connectors/valkey/ValkeyConnector.py +535 -0
- meerschaum/connectors/valkey/__init__.py +8 -0
- meerschaum/connectors/valkey/_fetch.py +75 -0
- meerschaum/connectors/valkey/_pipes.py +839 -0
- meerschaum/connectors/valkey/_plugins.py +265 -0
- meerschaum/connectors/valkey/_users.py +305 -0
- meerschaum/core/Pipe/__init__.py +3 -0
- meerschaum/core/Pipe/_attributes.py +1 -2
- meerschaum/core/Pipe/_clear.py +16 -13
- meerschaum/core/Pipe/_copy.py +106 -0
- meerschaum/core/Pipe/_drop.py +4 -4
- meerschaum/core/Pipe/_dtypes.py +14 -14
- meerschaum/core/Pipe/_edit.py +15 -14
- meerschaum/core/Pipe/_sync.py +134 -51
- meerschaum/core/Pipe/_verify.py +11 -11
- meerschaum/core/User/_User.py +14 -12
- meerschaum/plugins/_Plugin.py +17 -13
- meerschaum/utils/_get_pipes.py +14 -20
- meerschaum/utils/dataframe.py +288 -101
- meerschaum/utils/dtypes/__init__.py +31 -6
- meerschaum/utils/dtypes/sql.py +4 -4
- meerschaum/utils/misc.py +3 -3
- meerschaum/utils/packages/_packages.py +1 -0
- {meerschaum-2.3.6.dist-info → meerschaum-2.4.0.dev1.dist-info}/METADATA +3 -1
- {meerschaum-2.3.6.dist-info → meerschaum-2.4.0.dev1.dist-info}/RECORD +53 -44
- {meerschaum-2.3.6.dist-info → meerschaum-2.4.0.dev1.dist-info}/WHEEL +1 -1
- {meerschaum-2.3.6.dist-info → meerschaum-2.4.0.dev1.dist-info}/LICENSE +0 -0
- {meerschaum-2.3.6.dist-info → meerschaum-2.4.0.dev1.dist-info}/NOTICE +0 -0
- {meerschaum-2.3.6.dist-info → meerschaum-2.4.0.dev1.dist-info}/entry_points.txt +0 -0
- {meerschaum-2.3.6.dist-info → meerschaum-2.4.0.dev1.dist-info}/top_level.txt +0 -0
- {meerschaum-2.3.6.dist-info → meerschaum-2.4.0.dev1.dist-info}/zip-safe +0 -0
meerschaum/utils/dataframe.py
CHANGED
@@ -7,7 +7,8 @@ Utility functions for working with DataFrames.
|
|
7
7
|
"""
|
8
8
|
|
9
9
|
from __future__ import annotations
|
10
|
-
from datetime import datetime
|
10
|
+
from datetime import datetime, timezone
|
11
|
+
from collections import defaultdict
|
11
12
|
|
12
13
|
import meerschaum as mrsm
|
13
14
|
from meerschaum.utils.typing import (
|
@@ -78,12 +79,13 @@ def add_missing_cols_to_df(
|
|
78
79
|
|
79
80
|
|
80
81
|
def filter_unseen_df(
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
82
|
+
old_df: 'pd.DataFrame',
|
83
|
+
new_df: 'pd.DataFrame',
|
84
|
+
safe_copy: bool = True,
|
85
|
+
dtypes: Optional[Dict[str, Any]] = None,
|
86
|
+
include_unchanged_columns: bool = False,
|
87
|
+
debug: bool = False,
|
88
|
+
) -> 'pd.DataFrame':
|
87
89
|
"""
|
88
90
|
Left join two DataFrames to find the newest unseen data.
|
89
91
|
|
@@ -102,6 +104,9 @@ def filter_unseen_df(
|
|
102
104
|
dtypes: Optional[Dict[str, Any]], default None
|
103
105
|
Optionally specify the datatypes of the dataframe.
|
104
106
|
|
107
|
+
include_unchanged_columns: bool, default False
|
108
|
+
If `True`, include columns which haven't changed on rows which have changed.
|
109
|
+
|
105
110
|
debug: bool, default False
|
106
111
|
Verbosity toggle.
|
107
112
|
|
@@ -135,8 +140,12 @@ def filter_unseen_df(
|
|
135
140
|
from decimal import Decimal
|
136
141
|
from meerschaum.utils.warnings import warn
|
137
142
|
from meerschaum.utils.packages import import_pandas, attempt_import
|
138
|
-
from meerschaum.utils.dtypes import
|
139
|
-
|
143
|
+
from meerschaum.utils.dtypes import (
|
144
|
+
to_pandas_dtype,
|
145
|
+
are_dtypes_equal,
|
146
|
+
attempt_cast_to_numeric,
|
147
|
+
coerce_timezone,
|
148
|
+
)
|
140
149
|
pd = import_pandas(debug=debug)
|
141
150
|
is_dask = 'dask' in new_df.__module__
|
142
151
|
if is_dask:
|
@@ -156,6 +165,19 @@ def filter_unseen_df(
|
|
156
165
|
new_df = add_missing_cols_to_df(new_df, old_df_dtypes)
|
157
166
|
old_df = add_missing_cols_to_df(old_df, new_df_dtypes)
|
158
167
|
|
168
|
+
new_types_missing_from_old = {
|
169
|
+
col: typ
|
170
|
+
for col, typ in new_df_dtypes.items()
|
171
|
+
if col not in old_df_dtypes
|
172
|
+
}
|
173
|
+
old_types_missing_from_new = {
|
174
|
+
col: typ
|
175
|
+
for col, typ in new_df_dtypes.items()
|
176
|
+
if col not in old_df_dtypes
|
177
|
+
}
|
178
|
+
old_df_dtypes.update(new_types_missing_from_old)
|
179
|
+
new_df_dtypes.update(old_types_missing_from_new)
|
180
|
+
|
159
181
|
### Edge case: two empty lists cast to DFs.
|
160
182
|
elif len(new_df.columns) == 0:
|
161
183
|
return new_df
|
@@ -163,6 +185,7 @@ def filter_unseen_df(
|
|
163
185
|
try:
|
164
186
|
### Order matters when checking equality.
|
165
187
|
new_df = new_df[old_df.columns]
|
188
|
+
|
166
189
|
except Exception as e:
|
167
190
|
warn(
|
168
191
|
"Was not able to cast old columns onto new DataFrame. " +
|
@@ -183,16 +206,38 @@ def filter_unseen_df(
|
|
183
206
|
for col, typ in new_df_dtypes.items():
|
184
207
|
if col not in dtypes:
|
185
208
|
dtypes[col] = typ
|
186
|
-
|
187
|
-
|
209
|
+
|
210
|
+
dt_dtypes = {
|
211
|
+
col: typ
|
212
|
+
for col, typ in dtypes.items()
|
213
|
+
if are_dtypes_equal(typ, 'datetime')
|
214
|
+
}
|
215
|
+
non_dt_dtypes = {
|
216
|
+
col: typ
|
217
|
+
for col, typ in dtypes.items()
|
218
|
+
if col not in dt_dtypes
|
219
|
+
}
|
220
|
+
|
221
|
+
cast_non_dt_cols = True
|
188
222
|
try:
|
189
|
-
new_df = new_df.astype(
|
190
|
-
|
223
|
+
new_df = new_df.astype(non_dt_dtypes)
|
224
|
+
cast_non_dt_cols = False
|
191
225
|
except Exception as e:
|
192
226
|
warn(
|
193
227
|
f"Was not able to cast the new DataFrame to the given dtypes.\n{e}"
|
194
228
|
)
|
195
229
|
|
230
|
+
cast_dt_cols = True
|
231
|
+
try:
|
232
|
+
for col, typ in dt_dtypes.items():
|
233
|
+
tz = typ.split(',')[-1].strip() if ',' in typ else None
|
234
|
+
new_df[col] = coerce_timezone(pd.to_datetime(new_df[col], utc=True))
|
235
|
+
cast_dt_cols = False
|
236
|
+
except Exception as e:
|
237
|
+
warn(f"Could not cast datetime columns:\n{e}")
|
238
|
+
|
239
|
+
cast_cols = cast_dt_cols or cast_non_dt_cols
|
240
|
+
|
196
241
|
new_numeric_cols_existing = get_numeric_cols(new_df)
|
197
242
|
old_numeric_cols = get_numeric_cols(old_df)
|
198
243
|
for col, typ in {k: v for k, v in dtypes.items()}.items():
|
@@ -257,19 +302,20 @@ def filter_unseen_df(
|
|
257
302
|
joined_df = merge(
|
258
303
|
new_df.fillna(NA),
|
259
304
|
old_df.fillna(NA),
|
260
|
-
how
|
261
|
-
on
|
262
|
-
indicator
|
305
|
+
how='left',
|
306
|
+
on=None,
|
307
|
+
indicator=True,
|
263
308
|
)
|
264
309
|
changed_rows_mask = (joined_df['_merge'] == 'left_only')
|
265
|
-
|
310
|
+
new_cols = list(new_df_dtypes)
|
311
|
+
delta_df = joined_df[new_cols][changed_rows_mask].reset_index(drop=True)
|
266
312
|
|
267
313
|
for json_col in json_cols:
|
268
314
|
if json_col not in delta_df.columns:
|
269
315
|
continue
|
270
316
|
try:
|
271
317
|
delta_df[json_col] = delta_df[json_col].apply(json.loads)
|
272
|
-
except Exception
|
318
|
+
except Exception:
|
273
319
|
warn(f"Unable to deserialize JSON column '{json_col}':\n{traceback.format_exc()}")
|
274
320
|
|
275
321
|
for numeric_col in numeric_cols:
|
@@ -277,19 +323,19 @@ def filter_unseen_df(
|
|
277
323
|
continue
|
278
324
|
try:
|
279
325
|
delta_df[numeric_col] = delta_df[numeric_col].apply(attempt_cast_to_numeric)
|
280
|
-
except Exception
|
326
|
+
except Exception:
|
281
327
|
warn(f"Unable to parse numeric column '{numeric_col}':\n{traceback.format_exc()}")
|
282
328
|
|
283
329
|
return delta_df
|
284
330
|
|
285
331
|
|
286
332
|
def parse_df_datetimes(
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
333
|
+
df: 'pd.DataFrame',
|
334
|
+
ignore_cols: Optional[Iterable[str]] = None,
|
335
|
+
chunksize: Optional[int] = None,
|
336
|
+
dtype_backend: str = 'numpy_nullable',
|
337
|
+
debug: bool = False,
|
338
|
+
) -> 'pd.DataFrame':
|
293
339
|
"""
|
294
340
|
Parse a pandas DataFrame for datetime columns and cast as datetimes.
|
295
341
|
|
@@ -461,6 +507,8 @@ def get_unhashable_cols(df: 'pd.DataFrame') -> List[str]:
|
|
461
507
|
-------
|
462
508
|
A list of columns.
|
463
509
|
"""
|
510
|
+
if df is None:
|
511
|
+
return []
|
464
512
|
if len(df) == 0:
|
465
513
|
return []
|
466
514
|
|
@@ -547,12 +595,12 @@ def get_numeric_cols(df: 'pd.DataFrame') -> List[str]:
|
|
547
595
|
|
548
596
|
|
549
597
|
def enforce_dtypes(
|
550
|
-
|
551
|
-
|
552
|
-
|
553
|
-
|
554
|
-
|
555
|
-
|
598
|
+
df: 'pd.DataFrame',
|
599
|
+
dtypes: Dict[str, str],
|
600
|
+
safe_copy: bool = True,
|
601
|
+
coerce_numeric: bool = True,
|
602
|
+
debug: bool = False,
|
603
|
+
) -> 'pd.DataFrame':
|
556
604
|
"""
|
557
605
|
Enforce the `dtypes` dictionary on a DataFrame.
|
558
606
|
|
@@ -617,9 +665,9 @@ def enforce_dtypes(
|
|
617
665
|
]
|
618
666
|
df_numeric_cols = get_numeric_cols(df)
|
619
667
|
if debug:
|
620
|
-
dprint(
|
668
|
+
dprint("Desired data types:")
|
621
669
|
pprint(dtypes)
|
622
|
-
dprint(
|
670
|
+
dprint("Data types for incoming DataFrame:")
|
623
671
|
pprint(df_dtypes)
|
624
672
|
|
625
673
|
if json_cols and len(df) > 0:
|
@@ -685,7 +733,7 @@ def enforce_dtypes(
|
|
685
733
|
if debug:
|
686
734
|
dprint(
|
687
735
|
"The incoming DataFrame has mostly the same types, skipping enforcement."
|
688
|
-
+
|
736
|
+
+ "The only detected difference was in the following datetime columns.\n"
|
689
737
|
+ " Timezone information may be stripped."
|
690
738
|
)
|
691
739
|
pprint(detected_dt_cols)
|
@@ -721,23 +769,23 @@ def enforce_dtypes(
|
|
721
769
|
if 'int' in str(t).lower():
|
722
770
|
try:
|
723
771
|
df[d] = df[d].astype('float64').astype(t)
|
724
|
-
except Exception
|
772
|
+
except Exception:
|
725
773
|
if debug:
|
726
774
|
dprint(f"Was unable to convert to float then {t}.")
|
727
775
|
return df
|
728
776
|
|
729
777
|
|
730
778
|
def get_datetime_bound_from_df(
|
731
|
-
|
732
|
-
|
733
|
-
|
734
|
-
|
779
|
+
df: Union['pd.DataFrame', Dict[str, List[Any]], List[Dict[str, Any]]],
|
780
|
+
datetime_column: str,
|
781
|
+
minimum: bool = True,
|
782
|
+
) -> Union[int, datetime, None]:
|
735
783
|
"""
|
736
784
|
Return the minimum or maximum datetime (or integer) from a DataFrame.
|
737
785
|
|
738
786
|
Parameters
|
739
787
|
----------
|
740
|
-
df: pd.DataFrame
|
788
|
+
df: Union['pd.DataFrame', Dict[str, List[Any]], List[Dict[str, Any]]]
|
741
789
|
The DataFrame, list, or dict which contains the range axis.
|
742
790
|
|
743
791
|
datetime_column: str
|
@@ -780,17 +828,74 @@ def get_datetime_bound_from_df(
|
|
780
828
|
return best_yet
|
781
829
|
|
782
830
|
if 'DataFrame' in str(type(df)):
|
831
|
+
from meerschaum.utils.dtypes import are_dtypes_equal
|
832
|
+
pandas = mrsm.attempt_import('pandas')
|
833
|
+
is_dask = 'dask' in df.__module__
|
834
|
+
|
783
835
|
if datetime_column not in df.columns:
|
784
836
|
return None
|
785
|
-
|
837
|
+
|
838
|
+
dt_val = (
|
786
839
|
df[datetime_column].min(skipna=True)
|
787
|
-
if minimum
|
788
|
-
|
840
|
+
if minimum else df[datetime_column].max(skipna=True)
|
841
|
+
)
|
842
|
+
if is_dask and dt_val is not None:
|
843
|
+
dt_val = dt_val.compute()
|
844
|
+
|
845
|
+
return (
|
846
|
+
pandas.to_datetime(dt_val).to_pydatetime()
|
847
|
+
if are_dtypes_equal(str(type(dt_val)), 'datetime')
|
848
|
+
else (dt_val if dt_val is not pandas.NA else None)
|
789
849
|
)
|
790
850
|
|
791
851
|
return None
|
792
852
|
|
793
853
|
|
854
|
+
def get_unique_index_values(
    df: Union['pd.DataFrame', Dict[str, List[Any]], List[Dict[str, Any]]],
    indices: List[str],
) -> Dict[str, List[Any]]:
    """
    Collect the distinct values found under each index column.

    Parameters
    ----------
    df: Union['pd.DataFrame', Dict[str, List[Any]], List[Dict[str, Any]]]
        The data to inspect: a DataFrame, a list of row dictionaries,
        or a dictionary mapping column names to lists of values.

    indices: List[str]
        The names of the index columns to look up.

    Returns
    -------
    A dictionary mapping each index found in the data to a list of its unique values.
    Indices which are absent from the data are omitted.
    """
    ### DataFrames are detected by type name so that pandas need not be imported up front.
    if 'dataframe' in str(type(df)).lower():
        pandas = mrsm.attempt_import('pandas')
        frame_uniques = {}
        for col in indices:
            if col not in df.columns:
                continue
            distinct_vals = set()
            for val in df[col].unique():
                ### Replace the `pandas.NA` singleton with `None`.
                distinct_vals.add(None if val is pandas.NA else val)
            frame_uniques[col] = list(distinct_vals)
        return frame_uniques

    collected = defaultdict(set)

    if isinstance(df, list):
        ### A list of documents: gather each document's value per index.
        for doc in df:
            for index in indices:
                if index not in doc:
                    continue
                collected[index].add(doc[index])

    elif isinstance(df, dict):
        ### A dictionary of columns: fold each column's values into its set.
        for index in indices:
            if index not in df:
                continue
            collected[index].update(df[index])

    return {index: list(vals) for index, vals in collected.items()}
|
897
|
+
|
898
|
+
|
794
899
|
def df_is_chunk_generator(df: Any) -> bool:
|
795
900
|
"""
|
796
901
|
Determine whether to treat `df` as a chunk generator.
|
@@ -826,10 +931,10 @@ def chunksize_to_npartitions(chunksize: Optional[int]) -> int:
|
|
826
931
|
|
827
932
|
|
828
933
|
def df_from_literal(
|
829
|
-
|
830
|
-
|
831
|
-
|
832
|
-
|
934
|
+
pipe: Optional[mrsm.Pipe] = None,
|
935
|
+
literal: str = None,
|
936
|
+
debug: bool = False
|
937
|
+
) -> 'pd.DataFrame':
|
833
938
|
"""
|
834
939
|
Construct a dataframe from a literal value, using the pipe's datetime and value column names.
|
835
940
|
|
@@ -889,17 +994,18 @@ def get_first_valid_dask_partition(ddf: 'dask.dataframe.DataFrame') -> Union['pd
|
|
889
994
|
|
890
995
|
|
891
996
|
def query_df(
|
892
|
-
|
893
|
-
|
894
|
-
|
895
|
-
|
896
|
-
|
897
|
-
|
898
|
-
|
899
|
-
|
900
|
-
|
901
|
-
|
902
|
-
|
997
|
+
df: 'pd.DataFrame',
|
998
|
+
params: Optional[Dict[str, Any]] = None,
|
999
|
+
begin: Union[datetime, int, None] = None,
|
1000
|
+
end: Union[datetime, int, None] = None,
|
1001
|
+
datetime_column: Optional[str] = None,
|
1002
|
+
select_columns: Optional[List[str]] = None,
|
1003
|
+
omit_columns: Optional[List[str]] = None,
|
1004
|
+
inplace: bool = False,
|
1005
|
+
reset_index: bool = False,
|
1006
|
+
coerce_types: bool = False,
|
1007
|
+
debug: bool = False,
|
1008
|
+
) -> 'pd.DataFrame':
|
903
1009
|
"""
|
904
1010
|
Query the dataframe with the params dictionary.
|
905
1011
|
|
@@ -931,25 +1037,75 @@ def query_df(
|
|
931
1037
|
inplace: bool, default False
|
932
1038
|
If `True`, modify the DataFrame inplace rather than creating a new DataFrame.
|
933
1039
|
|
934
|
-
reset_index: bool, default
|
1040
|
+
reset_index: bool, default False
|
935
1041
|
If `True`, reset the index in the resulting DataFrame.
|
936
1042
|
|
1043
|
+
coerce_types: bool, default False
|
1044
|
+
If `True`, cast the dataframe and parameters as strings before querying.
|
1045
|
+
|
937
1046
|
Returns
|
938
1047
|
-------
|
939
1048
|
A Pandas DataFrame query result.
|
940
1049
|
"""
|
1050
|
+
|
1051
|
+
def _process_select_columns(_df):
|
1052
|
+
if not select_columns:
|
1053
|
+
return
|
1054
|
+
for col in list(_df.columns):
|
1055
|
+
if col not in select_columns:
|
1056
|
+
del _df[col]
|
1057
|
+
|
1058
|
+
def _process_omit_columns(_df):
|
1059
|
+
if not omit_columns:
|
1060
|
+
return
|
1061
|
+
for col in list(_df.columns):
|
1062
|
+
if col in omit_columns:
|
1063
|
+
del _df[col]
|
1064
|
+
|
941
1065
|
if not params and not begin and not end:
|
1066
|
+
if not inplace:
|
1067
|
+
df = df.copy()
|
1068
|
+
_process_select_columns(df)
|
1069
|
+
_process_omit_columns(df)
|
942
1070
|
return df
|
943
1071
|
|
944
|
-
import json
|
945
|
-
import meerschaum as mrsm
|
946
1072
|
from meerschaum.utils.debug import dprint
|
947
1073
|
from meerschaum.utils.misc import get_in_ex_params
|
948
1074
|
from meerschaum.utils.warnings import warn
|
1075
|
+
from meerschaum.utils.dtypes import are_dtypes_equal, value_is_null
|
1076
|
+
dateutil_parser = mrsm.attempt_import('dateutil.parser')
|
1077
|
+
pandas = mrsm.attempt_import('pandas')
|
1078
|
+
NA = pandas.NA
|
1079
|
+
|
1080
|
+
if params:
|
1081
|
+
params = params.copy()
|
1082
|
+
for key, val in {k: v for k, v in params.items()}.items():
|
1083
|
+
if isinstance(val, (list, tuple)):
|
1084
|
+
if None in val:
|
1085
|
+
val = [item for item in val if item is not None] + [NA]
|
1086
|
+
params[key] = val
|
1087
|
+
if coerce_types:
|
1088
|
+
params[key] = [str(x) for x in val]
|
1089
|
+
else:
|
1090
|
+
if value_is_null(val):
|
1091
|
+
val = NA
|
1092
|
+
params[key] = NA
|
1093
|
+
if coerce_types:
|
1094
|
+
params[key] = str(val)
|
949
1095
|
|
950
1096
|
dtypes = {col: str(typ) for col, typ in df.dtypes.items()}
|
951
1097
|
|
952
|
-
if
|
1098
|
+
if inplace:
|
1099
|
+
df.fillna(NA, inplace=True)
|
1100
|
+
else:
|
1101
|
+
df = df.fillna(NA)
|
1102
|
+
|
1103
|
+
if isinstance(begin, str):
|
1104
|
+
begin = dateutil_parser.parse(begin)
|
1105
|
+
if isinstance(end, str):
|
1106
|
+
end = dateutil_parser.parse(end)
|
1107
|
+
|
1108
|
+
if begin is not None or end is not None:
|
953
1109
|
if not datetime_column or datetime_column not in df.columns:
|
954
1110
|
warn(
|
955
1111
|
f"The datetime column '{datetime_column}' is not present in the Dataframe, "
|
@@ -960,14 +1116,44 @@ def query_df(
|
|
960
1116
|
if debug:
|
961
1117
|
dprint(f"Querying dataframe:\n{params=} {begin=} {end=} {datetime_column=}")
|
962
1118
|
|
963
|
-
|
1119
|
+
if datetime_column and (begin is not None or end is not None):
|
1120
|
+
if debug:
|
1121
|
+
dprint("Checking for datetime column compatability.")
|
1122
|
+
|
1123
|
+
from meerschaum.utils.dtypes import are_dtypes_equal, coerce_timezone
|
1124
|
+
df_is_dt = are_dtypes_equal(str(df.dtypes[datetime_column]), 'datetime')
|
1125
|
+
begin_is_int = are_dtypes_equal(str(type(begin)), 'int')
|
1126
|
+
end_is_int = are_dtypes_equal(str(type(end)), 'int')
|
1127
|
+
|
1128
|
+
if df_is_dt:
|
1129
|
+
df_tz = (
|
1130
|
+
getattr(df[datetime_column].dt, 'tz', None)
|
1131
|
+
if hasattr(df[datetime_column], 'dt')
|
1132
|
+
else None
|
1133
|
+
)
|
1134
|
+
|
1135
|
+
if begin_is_int:
|
1136
|
+
begin = datetime.fromtimestamp(int(begin), timezone.utc).replace(tzinfo=None)
|
1137
|
+
if debug:
|
1138
|
+
dprint(f"`begin` will be cast to '{begin}'.")
|
1139
|
+
if end_is_int:
|
1140
|
+
end = datetime.fromtimestamp(int(end), timezone.utc).replace(tzinfo=None)
|
1141
|
+
if debug:
|
1142
|
+
dprint(f"`end` will be cast to '{end}'.")
|
1143
|
+
|
1144
|
+
begin_tz = begin.tzinfo if begin is not None else None
|
1145
|
+
end_tz = end.tzinfo if end is not None else None
|
1146
|
+
|
1147
|
+
if begin_tz is not None or end_tz is not None or df_tz is not None:
|
1148
|
+
begin = coerce_timezone(begin)
|
1149
|
+
end = coerce_timezone(end)
|
1150
|
+
if df_tz is not None:
|
1151
|
+
if debug:
|
1152
|
+
dprint(f"Casting column '{datetime_column}' to UTC...")
|
1153
|
+
df[datetime_column] = coerce_timezone(df[datetime_column])
|
1154
|
+
dprint(f"Using datetime bounds:\n{begin=}\n{end=}")
|
964
1155
|
|
965
|
-
|
966
|
-
if isinstance(x, (dict, list, tuple)):
|
967
|
-
return json.dumps(x, sort_keys=True, separators=(',', ':'), default=str)
|
968
|
-
if hasattr(x, 'isoformat'):
|
969
|
-
return x.isoformat()
|
970
|
-
return str(x)
|
1156
|
+
in_ex_params = get_in_ex_params(params)
|
971
1157
|
|
972
1158
|
masks = [
|
973
1159
|
(
|
@@ -984,34 +1170,44 @@ def query_df(
|
|
984
1170
|
masks.extend([
|
985
1171
|
(
|
986
1172
|
(
|
987
|
-
df[col].
|
988
|
-
|
989
|
-
|
990
|
-
for _in_val in in_vals
|
991
|
-
]
|
992
|
-
) if in_vals else True
|
1173
|
+
(df[col] if not coerce_types else df[col].astype(str)).isin(in_vals)
|
1174
|
+
if in_vals
|
1175
|
+
else True
|
993
1176
|
) & (
|
994
|
-
~df[col].
|
995
|
-
|
996
|
-
|
997
|
-
for _ex_val in ex_vals
|
998
|
-
]
|
999
|
-
) if ex_vals else True
|
1177
|
+
~(df[col] if not coerce_types else df[col].astype(str)).isin(ex_vals)
|
1178
|
+
if ex_vals
|
1179
|
+
else True
|
1000
1180
|
)
|
1001
1181
|
)
|
1002
1182
|
for col, (in_vals, ex_vals) in in_ex_params.items()
|
1003
1183
|
if col in df.columns
|
1004
1184
|
])
|
1005
1185
|
query_mask = masks[0]
|
1006
|
-
for mask in masks:
|
1186
|
+
for mask in masks[1:]:
|
1007
1187
|
query_mask = query_mask & mask
|
1008
1188
|
|
1189
|
+
original_cols = df.columns
|
1190
|
+
bool_cols = [
|
1191
|
+
col
|
1192
|
+
for col, typ in df.dtypes.items()
|
1193
|
+
if are_dtypes_equal(str(typ), 'bool')
|
1194
|
+
]
|
1195
|
+
for col in bool_cols:
|
1196
|
+
df[col] = df[col].astype('boolean[pyarrow]')
|
1197
|
+
df['__mrsm_mask'] = query_mask
|
1198
|
+
|
1009
1199
|
if inplace:
|
1010
|
-
df.where(query_mask, inplace=
|
1011
|
-
df.dropna(how='all', inplace=
|
1200
|
+
df.where(query_mask, other=NA, inplace=True)
|
1201
|
+
df.dropna(how='all', inplace=True)
|
1012
1202
|
result_df = df
|
1013
1203
|
else:
|
1014
|
-
result_df = df.where(query_mask
|
1204
|
+
result_df = df.where(query_mask, other=NA)
|
1205
|
+
result_df.dropna(how='all', inplace=True)
|
1206
|
+
|
1207
|
+
if '__mrsm_mask' in df.columns:
|
1208
|
+
del df['__mrsm_mask']
|
1209
|
+
if '__mrsm_mask' in result_df.columns:
|
1210
|
+
del result_df['__mrsm_mask']
|
1015
1211
|
|
1016
1212
|
if reset_index:
|
1017
1213
|
result_df.reset_index(drop=True, inplace=True)
|
@@ -1019,27 +1215,18 @@ def query_df(
|
|
1019
1215
|
result_df = enforce_dtypes(
|
1020
1216
|
result_df,
|
1021
1217
|
dtypes,
|
1022
|
-
safe_copy
|
1023
|
-
debug
|
1024
|
-
coerce_numeric
|
1218
|
+
safe_copy=False,
|
1219
|
+
debug=debug,
|
1220
|
+
coerce_numeric=False,
|
1025
1221
|
)
|
1026
1222
|
|
1027
1223
|
if select_columns == ['*']:
|
1028
1224
|
select_columns = None
|
1029
1225
|
|
1030
1226
|
if not select_columns and not omit_columns:
|
1031
|
-
return result_df
|
1227
|
+
return result_df[original_cols]
|
1032
1228
|
|
1033
|
-
|
1034
|
-
|
1035
|
-
|
1036
|
-
del result_df[col]
|
1037
|
-
return result_df
|
1038
|
-
|
1039
|
-
if omit_columns:
|
1040
|
-
for col in list(result_df.columns):
|
1041
|
-
if col in omit_columns:
|
1042
|
-
del result_df[col]
|
1043
|
-
if debug:
|
1044
|
-
dprint(f"{dtypes=}")
|
1229
|
+
_process_select_columns(result_df)
|
1230
|
+
_process_omit_columns(result_df)
|
1231
|
+
|
1045
1232
|
return result_df
|
@@ -7,7 +7,10 @@ Utility functions for working with data types.
|
|
7
7
|
"""
|
8
8
|
|
9
9
|
import traceback
|
10
|
+
from datetime import timezone
|
10
11
|
from decimal import Decimal, Context, InvalidOperation
|
12
|
+
|
13
|
+
import meerschaum as mrsm
|
11
14
|
from meerschaum.utils.typing import Dict, Union, Any
|
12
15
|
from meerschaum.utils.warnings import warn
|
13
16
|
|
@@ -44,20 +47,19 @@ def to_pandas_dtype(dtype: str) -> str:
|
|
44
47
|
|
45
48
|
try:
|
46
49
|
return str(pandas.api.types.pandas_dtype(dtype))
|
47
|
-
except Exception
|
50
|
+
except Exception:
|
48
51
|
warn(
|
49
52
|
f"Invalid dtype '{dtype}', will use 'object' instead:\n"
|
50
53
|
+ f"{traceback.format_exc()}",
|
51
54
|
stack = False,
|
52
|
-
)
|
53
|
-
|
55
|
+
)
|
54
56
|
return 'object'
|
55
57
|
|
56
58
|
|
57
59
|
def are_dtypes_equal(
|
58
|
-
|
59
|
-
|
60
|
-
|
60
|
+
ldtype: Union[str, Dict[str, str]],
|
61
|
+
rdtype: Union[str, Dict[str, str]],
|
62
|
+
) -> bool:
|
61
63
|
"""
|
62
64
|
Determine whether two dtype strings may be considered
|
63
65
|
equivalent to avoid unnecessary conversions.
|
@@ -219,3 +221,26 @@ def quantize_decimal(x: Decimal, scale: int, precision: int) -> Decimal:
|
|
219
221
|
return x.quantize(precision_decimal, context=Context(prec=scale))
|
220
222
|
except InvalidOperation:
|
221
223
|
return x
|
224
|
+
|
225
|
+
|
226
|
+
def coerce_timezone(dt: Any) -> Any:
    """
    Convert a `datetime`, pandas `Timestamp`, or `Series` of `Timestamp`
    into its naive equivalent expressed in UTC.

    `None` and integers are handed back untouched,
    as are datetimes which already lack timezone information.
    """
    if dt is None or isinstance(dt, int):
        return dt

    ### Anything exposing a `dtype` attribute is handled as a Series.
    if hasattr(dt, 'dtype'):
        pandas = mrsm.attempt_import('pandas')
        utc_values = pandas.to_datetime(dt, utc=True)
        return utc_values.apply(lambda ts: ts.replace(tzinfo=None))

    if dt.tzinfo is not None:
        ### Shift to UTC first, then drop the timezone marker.
        return dt.astimezone(timezone.utc).replace(tzinfo=None)

    return dt
|
meerschaum/utils/dtypes/sql.py
CHANGED
@@ -400,10 +400,10 @@ def get_pd_type_from_db_type(db_type: str, allow_custom_dtypes: bool = False) ->
|
|
400
400
|
|
401
401
|
|
402
402
|
def get_db_type_from_pd_type(
|
403
|
-
|
404
|
-
|
405
|
-
|
406
|
-
|
403
|
+
pd_type: str,
|
404
|
+
flavor: str = 'default',
|
405
|
+
as_sqlalchemy: bool = False,
|
406
|
+
) -> Union[str, 'sqlalchemy.sql.visitors.TraversibleType']:
|
407
407
|
"""
|
408
408
|
Parse a Pandas data type into a flavor's database type.
|
409
409
|
|
meerschaum/utils/misc.py
CHANGED
@@ -1354,9 +1354,9 @@ def truncate_string_sections(item: str, delimeter: str = '_', max_len: int = 128
|
|
1354
1354
|
|
1355
1355
|
|
1356
1356
|
def separate_negation_values(
|
1357
|
-
|
1358
|
-
|
1359
|
-
|
1357
|
+
vals: Union[List[str], Tuple[str]],
|
1358
|
+
negation_prefix: Optional[str] = None,
|
1359
|
+
) -> Tuple[List[str], List[str]]:
|
1360
1360
|
"""
|
1361
1361
|
Separate the negated values from the positive ones.
|
1362
1362
|
Return two lists: positive and negative values.
|