meerschaum 2.3.5.dev0__py3-none-any.whl → 2.4.0.dev0__py3-none-any.whl

This diff compares the contents of publicly available package versions as published to their respective public registries. It is provided for informational purposes only.
Files changed (62)
  1. meerschaum/_internal/arguments/__init__.py +2 -1
  2. meerschaum/_internal/arguments/_parse_arguments.py +86 -7
  3. meerschaum/_internal/entry.py +29 -13
  4. meerschaum/actions/api.py +16 -16
  5. meerschaum/actions/bootstrap.py +36 -10
  6. meerschaum/actions/start.py +16 -15
  7. meerschaum/api/_events.py +11 -7
  8. meerschaum/api/dash/__init__.py +7 -6
  9. meerschaum/api/dash/callbacks/__init__.py +1 -0
  10. meerschaum/api/dash/callbacks/dashboard.py +7 -5
  11. meerschaum/api/dash/callbacks/pipes.py +42 -0
  12. meerschaum/api/dash/pages/__init__.py +1 -0
  13. meerschaum/api/dash/pages/pipes.py +16 -0
  14. meerschaum/api/dash/pipes.py +79 -47
  15. meerschaum/api/dash/users.py +19 -6
  16. meerschaum/api/routes/_actions.py +0 -98
  17. meerschaum/api/routes/_jobs.py +38 -18
  18. meerschaum/api/routes/_login.py +4 -4
  19. meerschaum/api/routes/_pipes.py +3 -3
  20. meerschaum/config/_default.py +9 -2
  21. meerschaum/config/_version.py +1 -1
  22. meerschaum/config/stack/__init__.py +59 -18
  23. meerschaum/config/static/__init__.py +2 -0
  24. meerschaum/connectors/Connector.py +19 -13
  25. meerschaum/connectors/__init__.py +9 -5
  26. meerschaum/connectors/api/_actions.py +22 -36
  27. meerschaum/connectors/api/_jobs.py +1 -0
  28. meerschaum/connectors/poll.py +30 -24
  29. meerschaum/connectors/sql/_pipes.py +126 -154
  30. meerschaum/connectors/sql/_plugins.py +45 -43
  31. meerschaum/connectors/sql/_users.py +46 -38
  32. meerschaum/connectors/valkey/ValkeyConnector.py +535 -0
  33. meerschaum/connectors/valkey/__init__.py +8 -0
  34. meerschaum/connectors/valkey/_fetch.py +75 -0
  35. meerschaum/connectors/valkey/_pipes.py +839 -0
  36. meerschaum/connectors/valkey/_plugins.py +265 -0
  37. meerschaum/connectors/valkey/_users.py +305 -0
  38. meerschaum/core/Pipe/__init__.py +2 -0
  39. meerschaum/core/Pipe/_attributes.py +1 -2
  40. meerschaum/core/Pipe/_drop.py +4 -4
  41. meerschaum/core/Pipe/_dtypes.py +14 -14
  42. meerschaum/core/Pipe/_edit.py +15 -14
  43. meerschaum/core/Pipe/_sync.py +134 -51
  44. meerschaum/core/User/_User.py +14 -12
  45. meerschaum/jobs/_Job.py +26 -8
  46. meerschaum/jobs/systemd.py +20 -8
  47. meerschaum/plugins/_Plugin.py +17 -13
  48. meerschaum/utils/_get_pipes.py +14 -20
  49. meerschaum/utils/dataframe.py +288 -101
  50. meerschaum/utils/dtypes/__init__.py +31 -6
  51. meerschaum/utils/dtypes/sql.py +4 -4
  52. meerschaum/utils/misc.py +3 -3
  53. meerschaum/utils/packages/_packages.py +1 -0
  54. meerschaum/utils/prompt.py +1 -1
  55. {meerschaum-2.3.5.dev0.dist-info → meerschaum-2.4.0.dev0.dist-info}/METADATA +3 -1
  56. {meerschaum-2.3.5.dev0.dist-info → meerschaum-2.4.0.dev0.dist-info}/RECORD +62 -54
  57. {meerschaum-2.3.5.dev0.dist-info → meerschaum-2.4.0.dev0.dist-info}/WHEEL +1 -1
  58. {meerschaum-2.3.5.dev0.dist-info → meerschaum-2.4.0.dev0.dist-info}/LICENSE +0 -0
  59. {meerschaum-2.3.5.dev0.dist-info → meerschaum-2.4.0.dev0.dist-info}/NOTICE +0 -0
  60. {meerschaum-2.3.5.dev0.dist-info → meerschaum-2.4.0.dev0.dist-info}/entry_points.txt +0 -0
  61. {meerschaum-2.3.5.dev0.dist-info → meerschaum-2.4.0.dev0.dist-info}/top_level.txt +0 -0
  62. {meerschaum-2.3.5.dev0.dist-info → meerschaum-2.4.0.dev0.dist-info}/zip-safe +0 -0
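The most substantial addition in this release is the new Valkey connector package (meerschaum/connectors/valkey/), which introduces Valkey as an instance connector type alongside the existing SQL and API connectors. As rough orientation only, here is a sketch of how such a connector would presumably be used, assuming it follows Meerschaum's usual mrsm.get_connector() registration pattern; the label, URI, and pipe keys below are hypothetical:

    import meerschaum as mrsm

    # Hypothetical usage: assumes the new modules register under the
    # 'valkey' connector type, mirroring the existing 'sql' and 'api' types.
    conn = mrsm.get_connector('valkey', 'main', uri='valkey://localhost:6379')
    pipe = mrsm.Pipe('demo', 'weather', instance=conn)  # hypothetical pipe keys

The hunks below cover two of the modules with the most behavioral churn: meerschaum/utils/_get_pipes.py and meerschaum/utils/dataframe.py.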
meerschaum/utils/_get_pipes.py
@@ -16,19 +16,18 @@ from meerschaum.utils.typing import (
 __pdoc__ = {'get_pipes': True, 'fetch_pipes_keys': True}
 
 def get_pipes(
-        connector_keys: Union[str, List[str], None] = None,
-        metric_keys: Union[str, List[str], None] = None,
-        location_keys: Union[str, List[str], None] = None,
-        tags: Optional[List[str]] = None,
-        params: Optional[Dict[str, Any]] = None,
-        mrsm_instance: Union[str, InstanceConnector, None] = None,
-        instance: Union[str, InstanceConnector, None] = None,
-        as_list: bool = False,
-        method: str = 'registered',
-        wait: bool = False,
-        debug: bool = False,
-        **kw: Any
-    ) -> Union[PipesDict, List[mrsm.Pipe]]:
+    connector_keys: Union[str, List[str], None] = None,
+    metric_keys: Union[str, List[str], None] = None,
+    location_keys: Union[str, List[str], None] = None,
+    tags: Optional[List[str]] = None,
+    params: Optional[Dict[str, Any]] = None,
+    mrsm_instance: Union[str, InstanceConnector, None] = None,
+    instance: Union[str, InstanceConnector, None] = None,
+    as_list: bool = False,
+    method: str = 'registered',
+    debug: bool = False,
+    **kw: Any
+) -> Union[PipesDict, List[mrsm.Pipe]]:
     """
     Return a dictionary or list of `meerschaum.Pipe` objects.
 
@@ -72,10 +71,6 @@ def get_pipes(
         If `'all'`, create pipes from predefined metrics and locations. Required `connector_keys`.
         **NOTE:** Method `'all'` is not implemented!
 
-    wait: bool, default False
-        Wait for a connection before getting Pipes. Should only be true for cases where the
-        database might not be running (like the API).
-
     **kw: Any:
         Keyword arguments to pass to the `meerschaum.Pipe` constructor.
 
@@ -133,15 +128,14 @@ def get_pipes(
         location_keys = [location_keys]
 
     ### Get SQL or API connector (keys come from `connector.fetch_pipes_keys()`).
-    ### If `wait`, wait until a connection is made
     if mrsm_instance is None:
         mrsm_instance = instance
     if mrsm_instance is None:
         mrsm_instance = get_config('meerschaum', 'instance', patch=True)
     if isinstance(mrsm_instance, str):
         from meerschaum.connectors.parse import parse_instance_keys
-        connector = parse_instance_keys(keys=mrsm_instance, wait=wait, debug=debug)
-    else: ### NOTE: mrsm_instance MUST be a SQL or API connector for this to work
+        connector = parse_instance_keys(keys=mrsm_instance, debug=debug)
+    else:
         from meerschaum.connectors import instance_types
         valid_connector = False
         if hasattr(mrsm_instance, 'type'):
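The `wait` keyword is gone from get_pipes() entirely (signature, docstring, and the parse_instance_keys() call). For orientation, a minimal sketch of a call that still works in 2.4.0; the connector keys and tag are invented for illustration:

    import meerschaum as mrsm

    # The removed `wait` flag no longer does anything here.
    pipes = mrsm.get_pipes(
        connector_keys='sql:main',   # hypothetical connector keys
        tags=['production'],         # hypothetical tag
        as_list=True,
    )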
meerschaum/utils/dataframe.py
@@ -7,7 +7,8 @@ Utility functions for working with DataFrames.
 """
 
 from __future__ import annotations
-from datetime import datetime
+from datetime import datetime, timezone
+from collections import defaultdict
 
 import meerschaum as mrsm
 from meerschaum.utils.typing import (
@@ -78,12 +79,13 @@ def add_missing_cols_to_df(
 
 
 def filter_unseen_df(
-        old_df: 'pd.DataFrame',
-        new_df: 'pd.DataFrame',
-        safe_copy: bool = True,
-        dtypes: Optional[Dict[str, Any]] = None,
-        debug: bool = False,
-    ) -> 'pd.DataFrame':
+    old_df: 'pd.DataFrame',
+    new_df: 'pd.DataFrame',
+    safe_copy: bool = True,
+    dtypes: Optional[Dict[str, Any]] = None,
+    include_unchanged_columns: bool = False,
+    debug: bool = False,
+) -> 'pd.DataFrame':
     """
     Left join two DataFrames to find the newest unseen data.
 
@@ -102,6 +104,9 @@ def filter_unseen_df(
     dtypes: Optional[Dict[str, Any]], default None
         Optionally specify the datatypes of the dataframe.
 
+    include_unchanged_columns: bool, default False
+        If `True`, include columns which haven't changed on rows which have changed.
+
     debug: bool, default False
         Verbosity toggle.
 
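For context on what filter_unseen_df() does before the new flag comes into play: it left-joins new_df onto old_df and keeps only the rows of new_df that old_df hasn't seen. A small self-contained sketch of that documented behavior (the sample data is invented):

    import pandas as pd
    from meerschaum.utils.dataframe import filter_unseen_df

    old_df = pd.DataFrame({'id': [1, 2], 'value': [10, 20]})
    new_df = pd.DataFrame({'id': [1, 2, 3], 'value': [10, 25, 30]})

    # Keeps the changed row (2, 25) and the brand-new row (3, 30);
    # the unchanged row (1, 10) is filtered out.
    delta = filter_unseen_df(old_df, new_df)
    print(delta)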
@@ -135,8 +140,12 @@ def filter_unseen_df(
     from decimal import Decimal
     from meerschaum.utils.warnings import warn
     from meerschaum.utils.packages import import_pandas, attempt_import
-    from meerschaum.utils.dtypes import to_pandas_dtype, are_dtypes_equal, attempt_cast_to_numeric
-    from meerschaum.utils.debug import dprint
+    from meerschaum.utils.dtypes import (
+        to_pandas_dtype,
+        are_dtypes_equal,
+        attempt_cast_to_numeric,
+        coerce_timezone,
+    )
     pd = import_pandas(debug=debug)
     is_dask = 'dask' in new_df.__module__
     if is_dask:
@@ -156,6 +165,19 @@ def filter_unseen_df(
         new_df = add_missing_cols_to_df(new_df, old_df_dtypes)
         old_df = add_missing_cols_to_df(old_df, new_df_dtypes)
 
+        new_types_missing_from_old = {
+            col: typ
+            for col, typ in new_df_dtypes.items()
+            if col not in old_df_dtypes
+        }
+        old_types_missing_from_new = {
+            col: typ
+            for col, typ in new_df_dtypes.items()
+            if col not in old_df_dtypes
+        }
+        old_df_dtypes.update(new_types_missing_from_old)
+        new_df_dtypes.update(old_types_missing_from_new)
+
     ### Edge case: two empty lists cast to DFs.
     elif len(new_df.columns) == 0:
         return new_df
@@ -163,6 +185,7 @@ def filter_unseen_df(
     try:
         ### Order matters when checking equality.
         new_df = new_df[old_df.columns]
+
     except Exception as e:
         warn(
             "Was not able to cast old columns onto new DataFrame. " +
@@ -183,16 +206,38 @@ def filter_unseen_df(
     for col, typ in new_df_dtypes.items():
         if col not in dtypes:
             dtypes[col] = typ
-
-    cast_cols = True
+
+    dt_dtypes = {
+        col: typ
+        for col, typ in dtypes.items()
+        if are_dtypes_equal(typ, 'datetime')
+    }
+    non_dt_dtypes = {
+        col: typ
+        for col, typ in dtypes.items()
+        if col not in dt_dtypes
+    }
+
+    cast_non_dt_cols = True
     try:
-        new_df = new_df.astype(dtypes)
-        cast_cols = False
+        new_df = new_df.astype(non_dt_dtypes)
+        cast_non_dt_cols = False
     except Exception as e:
         warn(
             f"Was not able to cast the new DataFrame to the given dtypes.\n{e}"
         )
 
+    cast_dt_cols = True
+    try:
+        for col, typ in dt_dtypes.items():
+            tz = typ.split(',')[-1].strip() if ',' in typ else None
+            new_df[col] = coerce_timezone(pd.to_datetime(new_df[col], utc=True))
+        cast_dt_cols = False
+    except Exception as e:
+        warn(f"Could not cast datetime columns:\n{e}")
+
+    cast_cols = cast_dt_cols or cast_non_dt_cols
+
     new_numeric_cols_existing = get_numeric_cols(new_df)
     old_numeric_cols = get_numeric_cols(old_df)
     for col, typ in {k: v for k, v in dtypes.items()}.items():
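The motivation for splitting dt_dtypes out of the bulk .astype() call above is that pandas is far more forgiving when datetimes go through pd.to_datetime(..., utc=True) than through astype() with a dtype string, especially once mixed or missing UTC offsets are involved. A pandas-only illustration (not Meerschaum code) of the difference:

    import pandas as pd

    s = pd.Series(['2024-01-01 00:00:00+00:00', '2024-01-02 05:30:00+05:30'])

    # s.astype('datetime64[ns]') can raise on these mixed-offset strings,
    # while to_datetime(..., utc=True) normalizes both values to UTC.
    utc = pd.to_datetime(s, utc=True)
    print(utc.dt.tz)  # UTC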
@@ -257,19 +302,20 @@ def filter_unseen_df(
     joined_df = merge(
         new_df.fillna(NA),
         old_df.fillna(NA),
-        how = 'left',
-        on = None,
-        indicator = True,
+        how='left',
+        on=None,
+        indicator=True,
     )
     changed_rows_mask = (joined_df['_merge'] == 'left_only')
-    delta_df = joined_df[list(new_df_dtypes.keys())][changed_rows_mask].reset_index(drop=True)
+    new_cols = list(new_df_dtypes)
+    delta_df = joined_df[new_cols][changed_rows_mask].reset_index(drop=True)
 
     for json_col in json_cols:
         if json_col not in delta_df.columns:
             continue
         try:
             delta_df[json_col] = delta_df[json_col].apply(json.loads)
-        except Exception as e:
+        except Exception:
             warn(f"Unable to deserialize JSON column '{json_col}':\n{traceback.format_exc()}")
 
     for numeric_col in numeric_cols:
@@ -277,19 +323,19 @@ def filter_unseen_df(
             continue
         try:
             delta_df[numeric_col] = delta_df[numeric_col].apply(attempt_cast_to_numeric)
-        except Exception as e:
+        except Exception:
             warn(f"Unable to parse numeric column '{numeric_col}':\n{traceback.format_exc()}")
 
     return delta_df
 
 
 def parse_df_datetimes(
-        df: 'pd.DataFrame',
-        ignore_cols: Optional[Iterable[str]] = None,
-        chunksize: Optional[int] = None,
-        dtype_backend: str = 'numpy_nullable',
-        debug: bool = False,
-    ) -> 'pd.DataFrame':
+    df: 'pd.DataFrame',
+    ignore_cols: Optional[Iterable[str]] = None,
+    chunksize: Optional[int] = None,
+    dtype_backend: str = 'numpy_nullable',
+    debug: bool = False,
+) -> 'pd.DataFrame':
     """
     Parse a pandas DataFrame for datetime columns and cast as datetimes.
 
@@ -461,6 +507,8 @@ def get_unhashable_cols(df: 'pd.DataFrame') -> List[str]:
     -------
     A list of columns.
     """
+    if df is None:
+        return []
     if len(df) == 0:
         return []
 
@@ -547,12 +595,12 @@ def get_numeric_cols(df: 'pd.DataFrame') -> List[str]:
 
 
 def enforce_dtypes(
-        df: 'pd.DataFrame',
-        dtypes: Dict[str, str],
-        safe_copy: bool = True,
-        coerce_numeric: bool = True,
-        debug: bool = False,
-    ) -> 'pd.DataFrame':
+    df: 'pd.DataFrame',
+    dtypes: Dict[str, str],
+    safe_copy: bool = True,
+    coerce_numeric: bool = True,
+    debug: bool = False,
+) -> 'pd.DataFrame':
     """
     Enforce the `dtypes` dictionary on a DataFrame.
 
@@ -617,9 +665,9 @@ def enforce_dtypes(
     ]
     df_numeric_cols = get_numeric_cols(df)
     if debug:
-        dprint(f"Desired data types:")
+        dprint("Desired data types:")
         pprint(dtypes)
-        dprint(f"Data types for incoming DataFrame:")
+        dprint("Data types for incoming DataFrame:")
         pprint(df_dtypes)
 
     if json_cols and len(df) > 0:
@@ -685,7 +733,7 @@ def enforce_dtypes(
             if debug:
                 dprint(
                     "The incoming DataFrame has mostly the same types, skipping enforcement."
-                    + f"The only detected difference was in the following datetime columns.\n"
+                    + "The only detected difference was in the following datetime columns.\n"
                     + " Timezone information may be stripped."
                 )
                 pprint(detected_dt_cols)
@@ -721,23 +769,23 @@ def enforce_dtypes(
             if 'int' in str(t).lower():
                 try:
                     df[d] = df[d].astype('float64').astype(t)
-                except Exception as e:
+                except Exception:
                     if debug:
                         dprint(f"Was unable to convert to float then {t}.")
     return df
 
 
 def get_datetime_bound_from_df(
-        df: Union['pd.DataFrame', dict, list],
-        datetime_column: str,
-        minimum: bool = True,
-    ) -> Union[int, datetime, None]:
+    df: Union['pd.DataFrame', Dict[str, List[Any]], List[Dict[str, Any]]],
+    datetime_column: str,
+    minimum: bool = True,
+) -> Union[int, datetime, None]:
     """
     Return the minimum or maximum datetime (or integer) from a DataFrame.
 
     Parameters
     ----------
-    df: pd.DataFrame
+    df: Union['pd.DataFrame', Dict[str, List[Any]], List[Dict[str, Any]]]
         The DataFrame, list, or dict which contains the range axis.
 
     datetime_column: str
@@ -780,17 +828,74 @@ def get_datetime_bound_from_df(
         return best_yet
 
     if 'DataFrame' in str(type(df)):
+        from meerschaum.utils.dtypes import are_dtypes_equal
+        pandas = mrsm.attempt_import('pandas')
+        is_dask = 'dask' in df.__module__
+
         if datetime_column not in df.columns:
             return None
-        return (
+
+        dt_val = (
             df[datetime_column].min(skipna=True)
-            if minimum
-            else df[datetime_column].max(skipna=True)
+            if minimum else df[datetime_column].max(skipna=True)
+        )
+        if is_dask and dt_val is not None:
+            dt_val = dt_val.compute()
+
+        return (
+            pandas.to_datetime(dt_val).to_pydatetime()
+            if are_dtypes_equal(str(type(dt_val)), 'datetime')
+            else (dt_val if dt_val is not pandas.NA else None)
         )
 
     return None
 
 
+def get_unique_index_values(
+    df: Union['pd.DataFrame', Dict[str, List[Any]], List[Dict[str, Any]]],
+    indices: List[str],
+) -> Dict[str, List[Any]]:
+    """
+    Return a dictionary of the unique index values in a DataFrame.
+
+    Parameters
+    ----------
+    df: Union['pd.DataFrame', Dict[str, List[Any]], List[Dict[str, Any]]]
+        The dataframe (or list or dict) which contains index values.
+
+    indices: List[str]
+        The list of index columns.
+
+    Returns
+    -------
+    A dictionary mapping indices to unique values.
+    """
+    if 'dataframe' in str(type(df)).lower():
+        pandas = mrsm.attempt_import('pandas')
+        return {
+            col: list({
+                (val if val is not pandas.NA else None)
+                for val in df[col].unique()
+            })
+            for col in indices
+            if col in df.columns
+        }
+
+    unique_indices = defaultdict(lambda: set())
+    if isinstance(df, list):
+        for doc in df:
+            for index in indices:
+                if index in doc:
+                    unique_indices[index].add(doc[index])
+
+    elif isinstance(df, dict):
+        for index in indices:
+            if index in df:
+                unique_indices[index] = unique_indices[index].union(set(df[index]))
+
+    return {key: list(val) for key, val in unique_indices.items()}
+
+
 def df_is_chunk_generator(df: Any) -> bool:
     """
     Determine whether to treat `df` as a chunk generator.
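The new get_unique_index_values() accepts a DataFrame, a list of documents, or a dict of columns, and all three branches above return the same shape. A usage sketch against the list-of-dicts branch (the documents are invented):

    from meerschaum.utils.dataframe import get_unique_index_values

    docs = [
        {'plant_id': 1, 'room': 'kitchen'},
        {'plant_id': 2, 'room': 'kitchen'},
    ]
    print(get_unique_index_values(docs, ['plant_id', 'room']))
    # e.g. {'plant_id': [1, 2], 'room': ['kitchen']} (set ordering is not guaranteed)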
@@ -826,10 +931,10 @@ def chunksize_to_npartitions(chunksize: Optional[int]) -> int:
 
 
 def df_from_literal(
-        pipe: Optional[mrsm.Pipe] = None,
-        literal: str = None,
-        debug: bool = False
-    ) -> 'pd.DataFrame':
+    pipe: Optional[mrsm.Pipe] = None,
+    literal: str = None,
+    debug: bool = False
+) -> 'pd.DataFrame':
     """
     Construct a dataframe from a literal value, using the pipe's datetime and value column names.
 
@@ -889,17 +994,18 @@ def get_first_valid_dask_partition(ddf: 'dask.dataframe.DataFrame') -> Union['pd
 
 
 def query_df(
-        df: 'pd.DataFrame',
-        params: Optional[Dict[str, Any]] = None,
-        begin: Union[datetime, int, None] = None,
-        end: Union[datetime, int, None] = None,
-        datetime_column: Optional[str] = None,
-        select_columns: Optional[List[str]] = None,
-        omit_columns: Optional[List[str]] = None,
-        inplace: bool = False,
-        reset_index: bool = False,
-        debug: bool = False,
-    ) -> 'pd.DataFrame':
+    df: 'pd.DataFrame',
+    params: Optional[Dict[str, Any]] = None,
+    begin: Union[datetime, int, None] = None,
+    end: Union[datetime, int, None] = None,
+    datetime_column: Optional[str] = None,
+    select_columns: Optional[List[str]] = None,
+    omit_columns: Optional[List[str]] = None,
+    inplace: bool = False,
+    reset_index: bool = False,
+    coerce_types: bool = False,
+    debug: bool = False,
+) -> 'pd.DataFrame':
     """
     Query the dataframe with the params dictionary.
 
@@ -931,25 +1037,75 @@ def query_df(
     inplace: bool, default False
         If `True`, modify the DataFrame inplace rather than creating a new DataFrame.
 
-    reset_index: bool, default True
+    reset_index: bool, default False
        If `True`, reset the index in the resulting DataFrame.
 
+    coerce_types: bool, default False
+        If `True`, cast the dataframe and parameters as strings before querying.
+
     Returns
     -------
     A Pandas DataFrame query result.
     """
+
+    def _process_select_columns(_df):
+        if not select_columns:
+            return
+        for col in list(_df.columns):
+            if col not in select_columns:
+                del _df[col]
+
+    def _process_omit_columns(_df):
+        if not omit_columns:
+            return
+        for col in list(_df.columns):
+            if col in omit_columns:
+                del _df[col]
+
     if not params and not begin and not end:
+        if not inplace:
+            df = df.copy()
+        _process_select_columns(df)
+        _process_omit_columns(df)
         return df
 
-    import json
-    import meerschaum as mrsm
     from meerschaum.utils.debug import dprint
     from meerschaum.utils.misc import get_in_ex_params
     from meerschaum.utils.warnings import warn
+    from meerschaum.utils.dtypes import are_dtypes_equal, value_is_null
+    dateutil_parser = mrsm.attempt_import('dateutil.parser')
+    pandas = mrsm.attempt_import('pandas')
+    NA = pandas.NA
+
+    if params:
+        params = params.copy()
+        for key, val in {k: v for k, v in params.items()}.items():
+            if isinstance(val, (list, tuple)):
+                if None in val:
+                    val = [item for item in val if item is not None] + [NA]
+                    params[key] = val
+                if coerce_types:
+                    params[key] = [str(x) for x in val]
+            else:
+                if value_is_null(val):
+                    val = NA
+                    params[key] = NA
+                if coerce_types:
+                    params[key] = str(val)
 
     dtypes = {col: str(typ) for col, typ in df.dtypes.items()}
 
-    if begin or end:
+    if inplace:
+        df.fillna(NA, inplace=True)
+    else:
+        df = df.fillna(NA)
+
+    if isinstance(begin, str):
+        begin = dateutil_parser.parse(begin)
+    if isinstance(end, str):
+        end = dateutil_parser.parse(end)
+
+    if begin is not None or end is not None:
         if not datetime_column or datetime_column not in df.columns:
             warn(
                 f"The datetime column '{datetime_column}' is not present in the Dataframe, "
@@ -960,14 +1116,44 @@ def query_df(
     if debug:
         dprint(f"Querying dataframe:\n{params=} {begin=} {end=} {datetime_column=}")
 
-    in_ex_params = get_in_ex_params(params)
+    if datetime_column and (begin is not None or end is not None):
+        if debug:
+            dprint("Checking for datetime column compatability.")
+
+        from meerschaum.utils.dtypes import are_dtypes_equal, coerce_timezone
+        df_is_dt = are_dtypes_equal(str(df.dtypes[datetime_column]), 'datetime')
+        begin_is_int = are_dtypes_equal(str(type(begin)), 'int')
+        end_is_int = are_dtypes_equal(str(type(end)), 'int')
+
+        if df_is_dt:
+            df_tz = (
+                getattr(df[datetime_column].dt, 'tz', None)
+                if hasattr(df[datetime_column], 'dt')
+                else None
+            )
+
+            if begin_is_int:
+                begin = datetime.fromtimestamp(int(begin), timezone.utc).replace(tzinfo=None)
+                if debug:
+                    dprint(f"`begin` will be cast to '{begin}'.")
+            if end_is_int:
+                end = datetime.fromtimestamp(int(end), timezone.utc).replace(tzinfo=None)
+                if debug:
+                    dprint(f"`end` will be cast to '{end}'.")
+
+            begin_tz = begin.tzinfo if begin is not None else None
+            end_tz = end.tzinfo if end is not None else None
+
+            if begin_tz is not None or end_tz is not None or df_tz is not None:
+                begin = coerce_timezone(begin)
+                end = coerce_timezone(end)
+                if df_tz is not None:
+                    if debug:
+                        dprint(f"Casting column '{datetime_column}' to UTC...")
+                    df[datetime_column] = coerce_timezone(df[datetime_column])
+                    dprint(f"Using datetime bounds:\n{begin=}\n{end=}")
 
-    def serialize(x: Any) -> str:
-        if isinstance(x, (dict, list, tuple)):
-            return json.dumps(x, sort_keys=True, separators=(',', ':'), default=str)
-        if hasattr(x, 'isoformat'):
-            return x.isoformat()
-        return str(x)
+    in_ex_params = get_in_ex_params(params)
 
     masks = [
         (
@@ -984,34 +1170,44 @@ def query_df(
     masks.extend([
         (
             (
-                df[col].apply(serialize).isin(
-                    [
-                        serialize(_in_val)
-                        for _in_val in in_vals
-                    ]
-                ) if in_vals else True
+                (df[col] if not coerce_types else df[col].astype(str)).isin(in_vals)
+                if in_vals
+                else True
             ) & (
-                ~df[col].apply(serialize).isin(
-                    [
-                        serialize(_ex_val)
-                        for _ex_val in ex_vals
-                    ]
-                ) if ex_vals else True
+                ~(df[col] if not coerce_types else df[col].astype(str)).isin(ex_vals)
+                if ex_vals
+                else True
             )
         )
         for col, (in_vals, ex_vals) in in_ex_params.items()
        if col in df.columns
     ])
     query_mask = masks[0]
-    for mask in masks:
+    for mask in masks[1:]:
         query_mask = query_mask & mask
 
+    original_cols = df.columns
+    bool_cols = [
+        col
+        for col, typ in df.dtypes.items()
+        if are_dtypes_equal(str(typ), 'bool')
+    ]
+    for col in bool_cols:
+        df[col] = df[col].astype('boolean[pyarrow]')
+    df['__mrsm_mask'] = query_mask
+
     if inplace:
-        df.where(query_mask, inplace=inplace)
-        df.dropna(how='all', inplace=inplace)
+        df.where(query_mask, other=NA, inplace=True)
+        df.dropna(how='all', inplace=True)
         result_df = df
     else:
-        result_df = df.where(query_mask).dropna(how='all')
+        result_df = df.where(query_mask, other=NA)
+        result_df.dropna(how='all', inplace=True)
+
+    if '__mrsm_mask' in df.columns:
+        del df['__mrsm_mask']
+    if '__mrsm_mask' in result_df.columns:
+        del result_df['__mrsm_mask']
 
     if reset_index:
         result_df.reset_index(drop=True, inplace=True)
@@ -1019,27 +1215,18 @@ def query_df(
     result_df = enforce_dtypes(
         result_df,
         dtypes,
-        safe_copy = (not inplace),
-        debug = debug,
-        coerce_numeric = False,
+        safe_copy=False,
+        debug=debug,
+        coerce_numeric=False,
     )
 
     if select_columns == ['*']:
         select_columns = None
 
     if not select_columns and not omit_columns:
-        return result_df
+        return result_df[original_cols]
 
-    if select_columns:
-        for col in list(result_df.columns):
-            if col not in select_columns:
-                del result_df[col]
-        return result_df
-
-    if omit_columns:
-        for col in list(result_df.columns):
-            if col in omit_columns:
-                del result_df[col]
-    if debug:
-        dprint(f"{dtypes=}")
+    _process_select_columns(result_df)
+    _process_omit_columns(result_df)
+
     return result_df
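Taken together, the query_df() changes mean begin/end may now arrive as ISO strings (parsed via dateutil), integer bounds are promoted to UTC timestamps when the column is a datetime, and coerce_types=True falls back to comparing everything as strings. A hedged end-to-end sketch (the sample frame is invented):

    import pandas as pd
    from meerschaum.utils.dataframe import query_df

    df = pd.DataFrame({
        'dt': pd.to_datetime(['2024-01-01', '2024-01-02', '2024-01-03']),
        'color': ['red', 'blue', 'red'],
    })

    # Equality filter on `color` plus a string lower bound on `dt`:
    # expected to keep only the 2024-01-03 'red' row.
    result = query_df(
        df,
        params={'color': 'red'},
        begin='2024-01-02',
        datetime_column='dt',
        reset_index=True,
    )
    print(result)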