meerschaum 2.5.0__py3-none-any.whl → 2.6.0.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. meerschaum/_internal/arguments/_parser.py +6 -1
  2. meerschaum/_internal/entry.py +16 -5
  3. meerschaum/actions/edit.py +6 -6
  4. meerschaum/actions/sql.py +12 -11
  5. meerschaum/api/dash/pipes.py +95 -13
  6. meerschaum/api/routes/_webterm.py +1 -0
  7. meerschaum/config/_edit.py +46 -19
  8. meerschaum/config/_read_config.py +20 -9
  9. meerschaum/config/_version.py +1 -1
  10. meerschaum/config/stack/__init__.py +1 -1
  11. meerschaum/connectors/sql/_pipes.py +80 -24
  12. meerschaum/connectors/sql/_sql.py +29 -10
  13. meerschaum/connectors/valkey/_pipes.py +1 -1
  14. meerschaum/core/Pipe/__init__.py +8 -9
  15. meerschaum/core/Pipe/_attributes.py +33 -11
  16. meerschaum/core/Pipe/_data.py +26 -7
  17. meerschaum/core/Pipe/_dtypes.py +4 -4
  18. meerschaum/core/Pipe/_fetch.py +1 -1
  19. meerschaum/core/Pipe/_sync.py +16 -4
  20. meerschaum/core/Pipe/_verify.py +1 -1
  21. meerschaum/utils/dataframe.py +58 -31
  22. meerschaum/utils/dtypes/__init__.py +16 -5
  23. meerschaum/utils/dtypes/sql.py +58 -28
  24. meerschaum/utils/misc.py +49 -16
  25. meerschaum/utils/packages/_packages.py +2 -1
  26. meerschaum/utils/schedule.py +7 -5
  27. meerschaum/utils/sql.py +224 -40
  28. {meerschaum-2.5.0.dist-info → meerschaum-2.6.0.dev1.dist-info}/METADATA +5 -3
  29. {meerschaum-2.5.0.dist-info → meerschaum-2.6.0.dev1.dist-info}/RECORD +35 -35
  30. {meerschaum-2.5.0.dist-info → meerschaum-2.6.0.dev1.dist-info}/WHEEL +1 -1
  31. {meerschaum-2.5.0.dist-info → meerschaum-2.6.0.dev1.dist-info}/LICENSE +0 -0
  32. {meerschaum-2.5.0.dist-info → meerschaum-2.6.0.dev1.dist-info}/NOTICE +0 -0
  33. {meerschaum-2.5.0.dist-info → meerschaum-2.6.0.dev1.dist-info}/entry_points.txt +0 -0
  34. {meerschaum-2.5.0.dist-info → meerschaum-2.6.0.dev1.dist-info}/top_level.txt +0 -0
  35. {meerschaum-2.5.0.dist-info → meerschaum-2.6.0.dev1.dist-info}/zip-safe +0 -0
meerschaum/connectors/sql/_pipes.py

@@ -404,7 +404,7 @@ def get_create_index_queries(
     indices = pipe.indices
 
     _datetime = pipe.get_columns('datetime', error=False)
-    _datetime_type = pipe.dtypes.get(_datetime, 'datetime64[ns]')
+    _datetime_type = pipe.dtypes.get(_datetime, 'datetime64[ns, UTC]')
     _datetime_name = (
         sql_item_name(_datetime, self.flavor, None)
         if _datetime is not None else None
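Note on the dtype change that recurs throughout this release: the default datetime dtype moves from timezone-naive 'datetime64[ns]' to timezone-aware 'datetime64[ns, UTC]'. A minimal pandas sketch of the distinction (illustrative only, not meerschaum code):

import pandas as pd

# Naive timestamps carry no zone; tz-aware values pin each instant to UTC.
naive = pd.Series(pd.to_datetime(['2024-01-01 00:00:00']))
aware = naive.dt.tz_localize('UTC')
print(naive.dtype)  # datetime64[ns]
print(aware.dtype)  # datetime64[ns, UTC]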
@@ -738,7 +738,7 @@ def get_pipe_data(
                 dt_type = dtypes.get(_dt, 'object').lower()
                 if 'datetime' not in dt_type:
                     if 'int' not in dt_type:
-                        dtypes[_dt] = 'datetime64[ns]'
+                        dtypes[_dt] = 'datetime64[ns, UTC]'
     existing_cols = pipe.get_columns_types(debug=debug)
     select_columns = (
         [
@@ -1197,7 +1197,12 @@ def sync_pipe(
     A `SuccessTuple` of success (`bool`) and message (`str`).
     """
     from meerschaum.utils.packages import import_pandas
-    from meerschaum.utils.sql import get_update_queries, sql_item_name, json_flavors, update_queries
+    from meerschaum.utils.sql import (
+        get_update_queries,
+        sql_item_name,
+        update_queries,
+        get_create_table_queries,
+    )
     from meerschaum.utils.misc import generate_password
     from meerschaum.utils.dataframe import get_json_cols, get_numeric_cols
     from meerschaum.utils.dtypes import are_dtypes_equal
@@ -1232,7 +1237,6 @@ def sync_pipe(
 
     ### if table does not exist, create it with indices
     is_new = False
-    add_cols_query = None
     if not pipe.exists(debug=debug):
         check_existing = False
         is_new = True
@@ -1252,9 +1256,7 @@ def sync_pipe(
 
     ### NOTE: Oracle SQL < 23c (2023) and SQLite does not support booleans,
     ### so infer bools and persist them to `dtypes`.
-    ### MSSQL supports `BIT` for booleans, but we coerce bools to int for MSSQL
-    ### to avoid merge issues.
-    if self.flavor in ('oracle', 'sqlite', 'mssql', 'mysql', 'mariadb'):
+    if self.flavor in ('oracle', 'sqlite', 'mysql', 'mariadb'):
         pipe_dtypes = pipe.dtypes
         new_bool_cols = {
             col: 'bool[pyarrow]'
@@ -1309,7 +1311,60 @@ def sync_pipe(
         'schema': self.get_pipe_schema(pipe),
     })
 
+    primary_key = pipe.columns.get('primary', None)
+    new_dtypes = {
+        **{
+            col: str(typ)
+            for col, typ in unseen_df.dtypes.items()
+        },
+        **{
+            col: 'int'
+            for col_ix, col in pipe.columns.items()
+            if col_ix != 'primary'
+        },
+        **pipe.dtypes
+    } if is_new else {}
+    autoincrement = (
+        pipe.parameters.get('autoincrement', False)
+        or (is_new and primary_key and primary_key not in new_dtypes)
+    )
+    if autoincrement and 'autoincrement' not in pipe.parameters:
+        pipe.parameters['autoincrement'] = autoincrement
+        edit_success, edit_msg = pipe.edit(debug=debug)
+        if not edit_success:
+            return edit_success, edit_msg
+
+    if autoincrement and primary_key and primary_key not in df.columns:
+        if unseen_df is not None and primary_key in unseen_df.columns:
+            del unseen_df[primary_key]
+        if update_df is not None and primary_key in update_df.columns:
+            del update_df[primary_key]
+        if delta_df is not None and primary_key in delta_df.columns:
+            del delta_df[primary_key]
+
+    if is_new:
+        if autoincrement:
+            _ = new_dtypes.pop(primary_key, None)
+
+        ### TODO: see if this can be removed
+        if 'datetime' in pipe.columns and self.flavor == 'timescaledb':
+            primary_key = None
+
+        create_table_queries = get_create_table_queries(
+            new_dtypes,
+            pipe.target,
+            self.flavor,
+            schema=self.get_pipe_schema(pipe),
+            primary_key=primary_key,
+        )
+        create_success = all(
+            self.exec_queries(create_table_queries, break_on_error=True, rollback=True, debug=debug)
+        )
+        if not create_success:
+            warn(f"Failed to create '{pipe.target}'. Continuing...")
+
     stats = self.to_sql(unseen_df, **unseen_kw)
+
     if is_new:
         if not self.create_indices(pipe, debug=debug):
             warn(f"Failed to create indices for {pipe}. Continuing...")
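This new block creates the target table up front with get_create_table_queries() so that a declared 'primary' index without incoming values becomes an autoincrementing primary key. A hedged usage sketch, assuming a configured 'sql:main' instance and hypothetical pipe keys:

import meerschaum as mrsm

# 'id' is declared as the primary index but never supplied in the rows,
# so the connector should build the table with an autoincrementing key
# and persist parameters['autoincrement'] via pipe.edit().
pipe = mrsm.Pipe(
    'demo', 'autoincrement_example',
    instance='sql:main',
    columns={'primary': 'id', 'datetime': 'dt'},
)
pipe.sync([
    {'dt': '2024-01-01', 'value': 1.0},
    {'dt': '2024-01-02', 'value': 2.0},
])
print(pipe.parameters.get('autoincrement'))  # expected: True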
@@ -1358,7 +1413,7 @@ def sync_pipe(
         ]
         update_queries = get_update_queries(
             pipe.target,
-            temp_target,
+            temp_target,
             self,
             join_cols,
             upsert=upsert,
@@ -1960,7 +2015,7 @@ def get_sync_time(
     table = sql_item_name(pipe.target, self.flavor, self.get_pipe_schema(pipe))
 
     dt_col = pipe.columns.get('datetime', None)
-    dt_type = pipe.dtypes.get(dt_col, 'datetime64[ns]')
+    dt_type = pipe.dtypes.get(dt_col, 'datetime64[ns, UTC]')
     if not dt_col:
         _dt = pipe.guess_datetime()
         dt = sql_item_name(_dt, self.flavor, None) if _dt else None
@@ -2366,7 +2421,7 @@ def get_pipe_columns_types(
     ----------
     pipe: mrsm.Pipe:
         The pipe to get the columns for.
-
+
     Returns
     -------
     A dictionary of columns names (`str`) and types (`str`).
@@ -2381,17 +2436,18 @@ def get_pipe_columns_types(
     }
     >>>
     """
+    from meerschaum.utils.sql import get_table_cols_types
     if not pipe.exists(debug=debug):
         return {}
 
-    if self.flavor == 'duckdb':
-        from meerschaum.utils.sql import get_table_cols_types
-        return get_table_cols_types(
-            pipe.target,
-            self,
-            flavor=self.flavor,
-            schema=self.get_pipe_schema(pipe),
-        )
+    # if self.flavor not in ('oracle', 'mysql', 'mariadb'):
+    return get_table_cols_types(
+        pipe.target,
+        self,
+        flavor=self.flavor,
+        schema=self.get_pipe_schema(pipe),
+        debug=debug,
+    )
 
     table_columns = {}
     try:
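With this hunk, get_pipe_columns_types() routes every flavor through get_table_cols_types() rather than special-casing DuckDB. The caller-facing contract is unchanged; a sketch with hypothetical keys:

import meerschaum as mrsm

# Returns a mapping of column names to database-style types,
# e.g. {'dt': 'TIMESTAMP', 'value': 'DOUBLE PRECISION'}.
pipe = mrsm.Pipe('demo', 'weather', instance='sql:main')
print(pipe.get_columns_types())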
@@ -2823,11 +2879,11 @@ def get_alter_columns_queries(
 
 
 def get_to_sql_dtype(
-        self,
-        pipe: 'mrsm.Pipe',
-        df: 'pd.DataFrame',
-        update_dtypes: bool = True,
-    ) -> Dict[str, 'sqlalchemy.sql.visitors.TraversibleType']:
+    self,
+    pipe: 'mrsm.Pipe',
+    df: 'pd.DataFrame',
+    update_dtypes: bool = True,
+) -> Dict[str, 'sqlalchemy.sql.visitors.TraversibleType']:
     """
     Given a pipe and DataFrame, return the `dtype` dictionary for `to_sql()`.
 
@@ -2947,7 +3003,7 @@ def deduplicate_pipe(
     duplicates_cte_name = sql_item_name('dups', self.flavor, None)
     duplicate_row_number_name = sql_item_name('dup_row_num', self.flavor, None)
     previous_row_number_name = sql_item_name('prev_row_num', self.flavor, None)
-
+
     index_list_str = (
         sql_item_name(dt_col, self.flavor, None)
         if dt_col
meerschaum/connectors/sql/_sql.py

@@ -17,8 +17,8 @@ from meerschaum.utils.warnings import warn
 ### database flavors that can use bulk insert
 _bulk_flavors = {'postgresql', 'timescaledb', 'citus'}
 ### flavors that do not support chunks
-_disallow_chunks_flavors = ['duckdb']
-_max_chunks_flavors = {'sqlite': 1000,}
+_disallow_chunks_flavors = []
+_max_chunks_flavors = {'sqlite': 1000}
 SKIP_READ_TRANSACTION_FLAVORS: list[str] = ['mssql']
 
 
@@ -123,7 +123,8 @@ def read(
     if chunks is not None and chunks <= 0:
         return []
     from meerschaum.utils.sql import sql_item_name, truncate_item_name
-    from meerschaum.utils.dtypes.sql import NUMERIC_PRECISION_FLAVORS
+    from meerschaum.utils.dtypes import are_dtypes_equal, coerce_timezone
+    from meerschaum.utils.dtypes.sql import NUMERIC_PRECISION_FLAVORS, TIMEZONE_NAIVE_FLAVORS
     from meerschaum.utils.packages import attempt_import, import_pandas
     from meerschaum.utils.pool import get_pool
     from meerschaum.utils.dataframe import chunksize_to_npartitions, get_numeric_cols
@@ -139,6 +140,16 @@ def read(
     if is_dask:
         chunksize = None
     schema = schema or self.schema
+    utc_dt_cols = [
+        col
+        for col, typ in dtype.items()
+        if are_dtypes_equal(typ, 'datetime') and 'utc' in typ.lower()
+    ] if dtype else []
+
+    if dtype and utc_dt_cols and self.flavor in TIMEZONE_NAIVE_FLAVORS:
+        dtype = dtype.copy()
+        for col in utc_dt_cols:
+            dtype[col] = 'datetime64[ns]'
 
     pool = get_pool(workers=workers)
     sqlalchemy = attempt_import("sqlalchemy")
@@ -162,7 +173,6 @@ def read(
         )
         chunksize = _max_chunks_flavors[self.flavor]
 
-    ### NOTE: A bug in duckdb_engine does not allow for chunks.
     if chunksize is not None and self.flavor in _disallow_chunks_flavors:
         chunksize = None
 
@@ -206,6 +216,9 @@ def read(
     chunk_list = []
     chunk_hook_results = []
     def _process_chunk(_chunk, _retry_on_failure: bool = True):
+        if self.flavor in TIMEZONE_NAIVE_FLAVORS:
+            for col in utc_dt_cols:
+                _chunk[col] = coerce_timezone(_chunk[col], strip_timezone=False)
         if not as_hook_results:
             chunk_list.append(_chunk)
         if chunk_hook is None:
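For flavors in TIMEZONE_NAIVE_FLAVORS, read() now requests UTC columns as naive dtypes (previous hunk) and re-attaches the zone per chunk with coerce_timezone(). Roughly, in plain pandas (an illustration of the effect, not the helper itself):

import pandas as pd

# A timezone-naive flavor returns naive values that are known to be UTC;
# re-localize so downstream code sees datetime64[ns, UTC].
chunk = pd.DataFrame({'dt': pd.to_datetime(['2024-01-01 12:00:00'])})
chunk['dt'] = chunk['dt'].dt.tz_localize('UTC')
print(chunk['dt'].dtype)  # datetime64[ns, UTC]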
@@ -765,7 +778,7 @@ def to_sql(
         DROP_IF_EXISTS_FLAVORS,
     )
     from meerschaum.utils.dataframe import get_json_cols, get_numeric_cols, get_uuid_cols
-    from meerschaum.utils.dtypes import are_dtypes_equal, quantize_decimal
+    from meerschaum.utils.dtypes import are_dtypes_equal, quantize_decimal, coerce_timezone
     from meerschaum.utils.dtypes.sql import (
         NUMERIC_PRECISION_FLAVORS,
         PD_TO_SQLALCHEMY_DTYPES_FLAVORS,
@@ -848,7 +861,6 @@ def to_sql(
         if not success:
             warn(f"Unable to drop {name}")
 
-
     ### Enforce NVARCHAR(2000) as text instead of CLOB.
     dtype = to_sql_kw.get('dtype', {})
     for col, typ in df.dtypes.items():
@@ -858,11 +870,18 @@ def to_sql(
                 dtype[col] = sqlalchemy.types.INTEGER
         to_sql_kw['dtype'] = dtype
     elif self.flavor == 'mssql':
+        pass
+        ### TODO clean this up
+        # dtype = to_sql_kw.get('dtype', {})
+        # for col, typ in df.dtypes.items():
+        #     if are_dtypes_equal(str(typ), 'bool'):
+        #         dtype[col] = sqlalchemy.types.INTEGER
+        # to_sql_kw['dtype'] = dtype
+    elif self.flavor == 'duckdb':
         dtype = to_sql_kw.get('dtype', {})
-        for col, typ in df.dtypes.items():
-            if are_dtypes_equal(str(typ), 'bool'):
-                dtype[col] = sqlalchemy.types.INTEGER
-        to_sql_kw['dtype'] = dtype
+        dt_cols = [col for col, typ in df.dtypes.items() if are_dtypes_equal(str(typ), 'datetime')]
+        for col in dt_cols:
+            df[col] = coerce_timezone(df[col], strip_utc=False)
 
     ### Check for JSON columns.
     if self.flavor not in json_flavors:
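On the write side, the new DuckDB branch of to_sql() makes datetime columns timezone-aware before insert, replacing the removed bool-to-INTEGER cast for MSSQL. A rough standalone equivalent of the coercion:

import pandas as pd

df = pd.DataFrame({'dt': pd.to_datetime(['2024-01-01'])})
# Localize naive datetime columns to UTC rather than stripping the zone.
for col in [c for c, t in df.dtypes.items() if 'datetime' in str(t)]:
    if df[col].dt.tz is None:
        df[col] = df[col].dt.tz_localize('UTC')
print(df.dtypes)  # dt: datetime64[ns, UTC]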
meerschaum/connectors/valkey/_pipes.py

@@ -706,7 +706,7 @@ def get_sync_time(
     """
     from meerschaum.utils.dtypes import are_dtypes_equal
     dt_col = pipe.columns.get('datetime', None)
-    dt_typ = pipe.dtypes.get(dt_col, 'datetime64[ns]')
+    dt_typ = pipe.dtypes.get(dt_col, 'datetime64[ns, UTC]')
     if not dt_col:
         return None
 
meerschaum/core/Pipe/__init__.py

@@ -153,6 +153,7 @@ class Pipe:
         dtypes: Optional[Dict[str, str]] = None,
         instance: Optional[Union[str, InstanceConnector]] = None,
         temporary: bool = False,
+        upsert: Optional[bool] = None,
         mrsm_instance: Optional[Union[str, InstanceConnector]] = None,
         cache: bool = False,
         debug: bool = False,
@@ -201,6 +202,9 @@ class Pipe:
     instance: Optional[Union[str, InstanceConnector]], default None
         Alias for `mrsm_instance`. If `mrsm_instance` is supplied, this value is ignored.
 
+    upsert: Optional[bool], default None
+        If `True`, set `upsert` to `True` in the parameters.
+
     temporary: bool, default False
         If `True`, prevent instance tables (pipes, users, plugins) from being created.
 
@@ -268,7 +272,7 @@ class Pipe:
             or indexes
             or self._attributes.get('parameters', {}).get('indices', None)
             or self._attributes.get('parameters', {}).get('indexes', None)
-        ) or columns
+        )
         if isinstance(indices, dict):
             indices_key = (
                 'indexes'
@@ -292,6 +296,9 @@ class Pipe:
         elif dtypes is not None:
             warn(f"The provided dtypes are of invalid type '{type(dtypes)}'.")
 
+        if isinstance(upsert, bool):
+            self._attributes['parameters']['upsert'] = upsert
+
         ### NOTE: The parameters dictionary is {} by default.
         ### A Pipe may be registered without parameters, then edited,
         ### or a Pipe may be registered with parameters set in-memory first.
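The new constructor flag is shorthand for setting the parameter directly; a minimal sketch with hypothetical keys:

import meerschaum as mrsm

# Equivalent to setting pipe.parameters['upsert'] = True before syncing.
pipe = mrsm.Pipe('demo', 'upsert_example', instance='sql:main', upsert=True)
print(pipe.parameters.get('upsert'))  # True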
@@ -308,7 +315,6 @@ class Pipe:
 
         self._cache = cache and get_config('system', 'experimental', 'cache')
 
-
     @property
     def meta(self):
         """
  """
@@ -321,7 +327,6 @@ class Pipe:
321
327
  'instance': self.instance_keys,
322
328
  }
323
329
 
324
-
325
330
  def keys(self) -> List[str]:
326
331
  """
327
332
  Return the ordered keys for this pipe.
@@ -332,7 +337,6 @@ class Pipe:
             if key != 'instance'
         }
 
-
     @property
     def instance_connector(self) -> Union[InstanceConnector, None]:
         """
@@ -369,7 +373,6 @@ class Pipe:
             return None
         return self._connector
 
-
     @property
     def cache_connector(self) -> Union[meerschaum.connectors.sql.SQLConnector, None]:
         """
@@ -391,7 +394,6 @@ class Pipe:
 
         return self._cache_connector
 
-
     @property
     def cache_pipe(self) -> Union['meerschaum.Pipe', None]:
         """
@@ -433,11 +435,9 @@ class Pipe:
 
         return self._cache_pipe
 
-
     def __str__(self, ansi: bool=False):
         return pipe_repr(self, ansi=ansi)
 
-
     def __eq__(self, other):
         try:
             return (
@@ -489,7 +489,6 @@ class Pipe:
         """
         self.__init__(**_state)
 
-
     def __getitem__(self, key: str) -> Any:
         """
         Index the pipe's attributes.
meerschaum/core/Pipe/_attributes.py

@@ -103,10 +103,25 @@ def indices(self) -> Union[Dict[str, Union[str, List[str]]], None]:
     if indices_key not in self.parameters:
         self.parameters[indices_key] = {}
     _indices = self.parameters[indices_key]
+    _columns = self.columns
+    dt_col = _columns.get('datetime', None)
     if not isinstance(_indices, dict):
         _indices = {}
         self.parameters[indices_key] = _indices
-    return {**self.columns, **_indices}
+    unique_cols = (
+        [dt_col]
+        if dt_col
+        else []
+    ) + [
+        col
+        for col_ix, col in _columns.items()
+        if col_ix != 'datetime'
+    ]
+    return {
+        **({'unique': unique_cols} if len(unique_cols) > 1 else {}),
+        **_columns,
+        **_indices
+    }
 
 
 @property
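The indices property now synthesizes a composite 'unique' entry from the datetime column plus the remaining index columns. Mirroring the merge with plain dictionaries (illustrative values):

_columns = {'datetime': 'dt', 'id': 'station_id'}
_indices = {'custom': ['station_id', 'region']}

dt_col = _columns.get('datetime', None)
unique_cols = ([dt_col] if dt_col else []) + [
    col for col_ix, col in _columns.items() if col_ix != 'datetime'
]
merged = {
    **({'unique': unique_cols} if len(unique_cols) > 1 else {}),
    **_columns,
    **_indices,
}
print(merged)
# {'unique': ['dt', 'station_id'], 'datetime': 'dt',
#  'id': 'station_id', 'custom': ['station_id', 'region']}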
@@ -196,7 +211,7 @@ def get_columns(self, *args: str, error: bool = False) -> Union[str, Tuple[str]]
     ----------
     *args: str
         The column names to be retrieved.
-
+
     error: bool, default False
         If `True`, raise an `Exception` if the specified column is not defined.
 
@@ -509,15 +524,22 @@ def get_indices(self) -> Dict[str, str]:
         if cols
     }
     _index_names = {
-        ix: (
-            _index_template.format(
-                target=_target,
-                column_names=column_names,
-                connector_keys=self.connector_keys,
-                metric_key=self.metric_key,
-                location_key=self.location_key,
-            )
+        ix: _index_template.format(
+            target=_target,
+            column_names=column_names,
+            connector_keys=self.connector_keys,
+            metric_key=self.metric_key,
+            location_key=self.location_key,
         )
         for ix, column_names in _column_names.items()
     }
-    return _index_names
+    ### NOTE: Skip any duplicate indices.
+    seen_index_names = {}
+    for ix, index_name in _index_names.items():
+        if index_name in seen_index_names:
+            continue
+        seen_index_names[index_name] = ix
+    return {
+        ix: index_name
+        for index_name, ix in seen_index_names.items()
+    }
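The dedupe keeps the first index key per generated name, guarding against two index keys (such as the new synthesized 'unique' index) rendering to the same physical name. A toy run of the same pattern (hypothetical names):

_index_names = {'datetime': 'IX_table_dt', 'unique': 'IX_table_dt'}
seen_index_names = {}
for ix, index_name in _index_names.items():
    if index_name in seen_index_names:
        continue
    seen_index_names[index_name] = ix
print({ix: name for name, ix in seen_index_names.items()})
# {'datetime': 'IX_table_dt'}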
meerschaum/core/Pipe/_data.py

@@ -23,8 +23,8 @@ def get_data(
     self,
     select_columns: Optional[List[str]] = None,
     omit_columns: Optional[List[str]] = None,
-    begin: Union[datetime, int, None] = None,
-    end: Union[datetime, int, None] = None,
+    begin: Union[datetime, int, str, None] = None,
+    end: Union[datetime, int, str, None] = None,
     params: Optional[Dict[str, Any]] = None,
     as_iterator: bool = False,
     as_chunks: bool = False,
@@ -48,12 +48,12 @@ def get_data(
     omit_columns: Optional[List[str]], default None
         If provided, remove these columns from the selection.
 
-    begin: Union[datetime, int, None], default None
+    begin: Union[datetime, int, str, None], default None
         Lower bound datetime to begin searching for data (inclusive).
         Translates to a `WHERE` clause like `WHERE datetime >= begin`.
         Defaults to `None`.
 
-    end: Union[datetime, int, None], default None
+    end: Union[datetime, int, str, None], default None
         Upper bound datetime to stop searching for data (inclusive).
         Translates to a `WHERE` clause like `WHERE datetime < end`.
         Defaults to `None`.
@@ -105,11 +105,12 @@ def get_data(
     from meerschaum.utils.venv import Venv
     from meerschaum.connectors import get_connector_plugin
     from meerschaum.utils.misc import iterate_chunks, items_str
-    from meerschaum.utils.dtypes import to_pandas_dtype
+    from meerschaum.utils.dtypes import to_pandas_dtype, coerce_timezone
     from meerschaum.utils.dataframe import add_missing_cols_to_df, df_is_chunk_generator
     from meerschaum.utils.packages import attempt_import
     dd = attempt_import('dask.dataframe') if as_dask else None
     dask = attempt_import('dask') if as_dask else None
+    dateutil_parser = attempt_import('dateutil.parser')
 
     if select_columns == '*':
         select_columns = None
@@ -120,11 +121,29 @@ def get_data(
         omit_columns = [omit_columns]
 
     as_iterator = as_iterator or as_chunks
+    dt_col = self.columns.get('datetime', None)
+    dt_typ = self.dtypes.get(dt_col, 'datetime64[ns, UTC]')
+    dt_is_utc = 'utc' in dt_typ.lower()
+    if isinstance(begin, str):
+        try:
+            begin = dateutil_parser.parse(begin)
+        except Exception as e:
+            warn(f"Failed to parse '{begin}' as datetime:\n{e}")
+            begin = None
+    if isinstance(end, str):
+        try:
+            end = dateutil_parser.parse(end)
+        except Exception as e:
+            warn(f"Failed to parse '{end}' as datetime:\n{e}")
+            end = None
+    if isinstance(begin, datetime):
+        begin = coerce_timezone(begin, strip_utc=(not dt_is_utc))
+    if isinstance(end, datetime):
+        end = coerce_timezone(end, strip_utc=(not dt_is_utc))
 
     def _sort_df(_df):
         if df_is_chunk_generator(_df):
             return _df
-        dt_col = self.columns.get('datetime', None)
         indices = [] if dt_col not in _df.columns else [dt_col]
         non_dt_cols = [
             col
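With this hunk, begin and end may be strings; they are parsed with dateutil and coerced to match the datetime column's timezone-awareness. A hedged usage sketch (hypothetical keys):

import meerschaum as mrsm

# Unparseable strings fall back to None with a warning.
pipe = mrsm.Pipe('demo', 'weather', instance='sql:main')
df = pipe.get_data(begin='2024-01-01', end='2024-02-01 00:00:00')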
@@ -607,7 +626,7 @@ def get_chunk_interval(
     if dt_col is None:
         return timedelta(minutes=chunk_minutes)
 
-    dt_dtype = self.dtypes.get(dt_col, 'datetime64[ns]')
+    dt_dtype = self.dtypes.get(dt_col, 'datetime64[ns, UTC]')
     if 'int' in dt_dtype.lower():
         return chunk_minutes
     return timedelta(minutes=chunk_minutes)
meerschaum/core/Pipe/_dtypes.py

@@ -101,18 +101,18 @@ def infer_dtypes(self, persist: bool = False, debug: bool = False) -> Dict[str,
         dt_col = self.columns.get('datetime', None)
         if dt_col:
             if not self.parameters.get('dtypes', {}).get(dt_col, None):
-                dtypes[dt_col] = 'datetime64[ns]'
+                dtypes[dt_col] = 'datetime64[ns, UTC]'
         return dtypes
 
-    from meerschaum.utils.sql import get_pd_type
-    from meerschaum.utils.misc import to_pandas_dtype
+    from meerschaum.utils.dtypes.sql import get_pd_type_from_db_type
+    from meerschaum.utils.dtypes import to_pandas_dtype
     columns_types = self.get_columns_types(debug=debug)
 
     ### NOTE: get_columns_types() may return either the types as
     ### PostgreSQL- or Pandas-style.
     dtypes = {
         c: (
-            get_pd_type(t, allow_custom_dtypes=True)
+            get_pd_type_from_db_type(t, allow_custom_dtypes=True)
             if str(t).isupper()
             else to_pandas_dtype(t)
         )
meerschaum/core/Pipe/_fetch.py

@@ -125,7 +125,7 @@ def get_backtrack_interval(
     if dt_col is None:
         return backtrack_interval
 
-    dt_dtype = self.dtypes.get(dt_col, 'datetime64[ns]')
+    dt_dtype = self.dtypes.get(dt_col, 'datetime64[ns, UTC]')
     if 'int' in dt_dtype.lower():
         return backtrack_minutes
 
meerschaum/core/Pipe/_sync.py

@@ -624,6 +624,18 @@ def filter_existing(
     merge = pd.merge
     NA = pd.NA
 
+    primary_key = self.columns.get('primary', None)
+    autoincrement = self.parameters.get('autoincrement', False)
+    pipe_columns = self.columns.copy()
+
+    if primary_key and autoincrement and df is not None and primary_key in df.columns:
+        if safe_copy:
+            df = df.copy()
+            safe_copy = False
+        if df[primary_key].isnull().all():
+            del df[primary_key]
+            _ = self.columns.pop(primary_key, None)
+
     def get_empty_df():
         empty_df = pd.DataFrame([])
         dtypes = dict(df.dtypes) if df is not None else {}
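This guard drops an autoincrement primary-key column when every incoming value is null, leaving id assignment to the database. The core check in plain pandas:

import pandas as pd

df = pd.DataFrame({'id': [None, None], 'value': [1, 2]})
if df['id'].isnull().all():
    del df['id']
print(list(df.columns))  # ['value']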
@@ -643,8 +655,8 @@ def filter_existing(
 
     ### begin is the oldest data in the new dataframe
     begin, end = None, None
-    dt_col = self.columns.get('datetime', None)
-    dt_type = self.dtypes.get(dt_col, 'datetime64[ns]') if dt_col else None
+    dt_col = pipe_columns.get('datetime', None)
+    dt_type = self.dtypes.get(dt_col, 'datetime64[ns, UTC]') if dt_col else None
     try:
         min_dt_val = df[dt_col].min(skipna=True) if dt_col else None
         if is_dask and min_dt_val is not None:
713
725
 
714
726
  unique_index_vals = {
715
727
  col: df[col].unique()
716
- for col in self.columns
728
+ for col in pipe_columns
717
729
  if col in df.columns and col != dt_col
718
730
  } if not date_bound_only else {}
719
731
  filter_params_index_limit = get_config('pipes', 'sync', 'filter_params_index_limit')
@@ -749,7 +761,7 @@ def filter_existing(
 
     ### Separate new rows from changed ones.
     on_cols = [
-        col for col_key, col in self.columns.items()
+        col for col_key, col in pipe_columns.items()
         if (
             col
             and
394
394
  if not dt_col:
395
395
  return bound_time_value
396
396
 
397
- dt_typ = self.dtypes.get(dt_col, 'datetime64[ns]')
397
+ dt_typ = self.dtypes.get(dt_col, 'datetime64[ns, UTC]')
398
398
  if 'int' in dt_typ.lower():
399
399
  return int(bound_time_value)
400
400