meerschaum 2.6.16__py3-none-any.whl → 2.7.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. meerschaum/_internal/arguments/_parse_arguments.py +1 -1
  2. meerschaum/actions/delete.py +65 -69
  3. meerschaum/actions/edit.py +22 -2
  4. meerschaum/actions/install.py +1 -2
  5. meerschaum/actions/sync.py +2 -3
  6. meerschaum/config/_default.py +1 -1
  7. meerschaum/config/_paths.py +2 -1
  8. meerschaum/config/_version.py +1 -1
  9. meerschaum/connectors/api/_pipes.py +4 -3
  10. meerschaum/connectors/sql/_create_engine.py +3 -3
  11. meerschaum/connectors/sql/_pipes.py +84 -38
  12. meerschaum/connectors/sql/_sql.py +6 -1
  13. meerschaum/connectors/valkey/_pipes.py +12 -1
  14. meerschaum/core/Pipe/__init__.py +23 -13
  15. meerschaum/core/Pipe/_attributes.py +19 -0
  16. meerschaum/core/Pipe/_dtypes.py +1 -1
  17. meerschaum/core/Pipe/_sync.py +61 -21
  18. meerschaum/core/Pipe/_verify.py +8 -7
  19. meerschaum/jobs/_Job.py +2 -1
  20. meerschaum/plugins/_Plugin.py +11 -14
  21. meerschaum/utils/daemon/Daemon.py +20 -13
  22. meerschaum/utils/dataframe.py +175 -13
  23. meerschaum/utils/dtypes/__init__.py +103 -14
  24. meerschaum/utils/dtypes/sql.py +26 -0
  25. meerschaum/utils/misc.py +8 -8
  26. meerschaum/utils/packages/_packages.py +1 -1
  27. meerschaum/utils/schedule.py +8 -3
  28. meerschaum/utils/sql.py +70 -47
  29. meerschaum/utils/venv/_Venv.py +4 -4
  30. meerschaum/utils/venv/__init__.py +33 -13
  31. {meerschaum-2.6.16.dist-info → meerschaum-2.7.0rc1.dist-info}/METADATA +2 -2
  32. {meerschaum-2.6.16.dist-info → meerschaum-2.7.0rc1.dist-info}/RECORD +38 -38
  33. {meerschaum-2.6.16.dist-info → meerschaum-2.7.0rc1.dist-info}/LICENSE +0 -0
  34. {meerschaum-2.6.16.dist-info → meerschaum-2.7.0rc1.dist-info}/NOTICE +0 -0
  35. {meerschaum-2.6.16.dist-info → meerschaum-2.7.0rc1.dist-info}/WHEEL +0 -0
  36. {meerschaum-2.6.16.dist-info → meerschaum-2.7.0rc1.dist-info}/entry_points.txt +0 -0
  37. {meerschaum-2.6.16.dist-info → meerschaum-2.7.0rc1.dist-info}/top_level.txt +0 -0
  38. {meerschaum-2.6.16.dist-info → meerschaum-2.7.0rc1.dist-info}/zip-safe +0 -0
meerschaum/connectors/sql/_pipes.py
@@ -460,10 +460,16 @@ def get_create_index_queries(
         else None
     )
     primary_key_constraint_name = (
-        sql_item_name(f'pk_{pipe.target}', self.flavor, None)
+        sql_item_name(f'PK_{pipe.target}', self.flavor, None)
         if primary_key is not None
         else None
     )
+    primary_key_clustered = "CLUSTERED" if _datetime is None else "NONCLUSTERED"
+    datetime_clustered = (
+        "CLUSTERED"
+        if not existing_primary_keys and _datetime is not None
+        else "NONCLUSTERED"
+    )
 
     _id_index_name = (
         sql_item_name(index_names['id'], self.flavor, None)
@@ -474,6 +480,7 @@ def get_create_index_queries(
     _create_space_partition = get_config('system', 'experimental', 'space')
 
     ### create datetime index
+    dt_query = None
     if _datetime is not None:
         if self.flavor == 'timescaledb' and pipe.parameters.get('hypertable', True):
             _id_count = (
@@ -504,19 +511,19 @@ def get_create_index_queries(
                 + 'if_not_exists => true, '
                 + "migrate_data => true);"
             )
-        elif self.flavor == 'mssql':
-            dt_query = (
-                "CREATE "
-                + ("CLUSTERED " if not primary_key else '')
-                + f"INDEX {_datetime_index_name} "
-                + f"ON {_pipe_name} ({_datetime_name})"
-            )
-        else: ### mssql, sqlite, etc.
-            dt_query = (
-                f"CREATE INDEX {_datetime_index_name} "
-                + f"ON {_pipe_name} ({_datetime_name})"
-            )
+        elif _datetime_index_name:
+            if self.flavor == 'mssql':
+                dt_query = (
+                    f"CREATE {datetime_clustered} INDEX {_datetime_index_name} "
+                    f"ON {_pipe_name} ({_datetime_name})"
+                )
+            else:
+                dt_query = (
+                    f"CREATE INDEX {_datetime_index_name} "
+                    + f"ON {_pipe_name} ({_datetime_name})"
+                )
 
+    if dt_query:
         index_queries[_datetime] = [dt_query]
 
     primary_queries = []
@@ -623,7 +630,7 @@ def get_create_index_queries(
         ),
         (
             f"ALTER TABLE {_pipe_name}\n"
-            f"ADD CONSTRAINT {primary_key_constraint_name} PRIMARY KEY ({primary_key_name})"
+            f"ADD CONSTRAINT {primary_key_constraint_name} PRIMARY KEY {primary_key_clustered} ({primary_key_name})"
         ),
     ])
     index_queries[primary_key] = primary_queries
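
Note: on SQL Server only one index per table may be CLUSTERED, so the two new flags are complementary: the datetime index takes the clustered slot when a datetime axis exists (and no primary key is already in place), while the primary-key constraint is clustered only when there is no datetime axis. A minimal sketch of the resulting statements, using illustrative names (target weather, datetime timestamp, primary key id):

    # Sketch of the new clustering rules; names are illustrative, and the real
    # logic lives in get_create_index_queries above.
    _datetime = 'timestamp'        # the pipe's datetime axis, or None
    existing_primary_keys = []     # nonempty if the table already has a primary key
    primary_key_clustered = "CLUSTERED" if _datetime is None else "NONCLUSTERED"
    datetime_clustered = (
        "CLUSTERED"
        if not existing_primary_keys and _datetime is not None
        else "NONCLUSTERED"
    )
    print(f"CREATE {datetime_clustered} INDEX ix_weather_timestamp ON weather (timestamp)")
    print(f"ALTER TABLE weather ADD CONSTRAINT PK_weather PRIMARY KEY {primary_key_clustered} (id)")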
@@ -875,6 +882,7 @@ def get_pipe_data(
     from meerschaum.utils.dtypes import (
         attempt_cast_to_numeric,
         attempt_cast_to_uuid,
+        attempt_cast_to_bytes,
         are_dtypes_equal,
     )
     from meerschaum.utils.dtypes.sql import get_pd_type_from_db_type
@@ -891,17 +899,15 @@ def get_pipe_data(
             col: get_pd_type_from_db_type(typ)
             for col, typ in cols_types.items()
         }
-    }
+    } if pipe.enforce else {}
     if dtypes:
         if self.flavor == 'sqlite':
             if not pipe.columns.get('datetime', None):
                 _dt = pipe.guess_datetime()
                 dt = sql_item_name(_dt, self.flavor, None) if _dt else None
-                is_guess = True
             else:
                 _dt = pipe.get_columns('datetime')
                 dt = sql_item_name(_dt, self.flavor, None)
-                is_guess = False
 
             if _dt:
                 dt_type = dtypes.get(_dt, 'object').lower()
@@ -929,7 +935,7 @@ def get_pipe_data(
         col: to_pandas_dtype(typ)
         for col, typ in dtypes.items()
         if col in select_columns and col not in (omit_columns or [])
-    }
+    } if pipe.enforce else {}
     query = self.get_pipe_data_query(
         pipe,
         select_columns=select_columns,
@@ -959,6 +965,11 @@ def get_pipe_data(
         for col, typ in pipe.dtypes.items()
         if typ == 'uuid' and col in dtypes
     ]
+    bytes_columns = [
+        col
+        for col, typ in pipe.dtypes.items()
+        if typ == 'bytes' and col in dtypes
+    ]
 
     kw['coerce_float'] = kw.get('coerce_float', (len(numeric_columns) == 0))
 
@@ -978,6 +989,11 @@ def get_pipe_data(
             continue
         df[col] = df[col].apply(attempt_cast_to_uuid)
 
+    for col in bytes_columns:
+        if col not in df.columns:
+            continue
+        df[col] = df[col].apply(attempt_cast_to_bytes)
+
     if self.flavor == 'sqlite':
         ignore_dt_cols = [
             col
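
Note: `attempt_cast_to_bytes` is introduced in meerschaum/utils/dtypes/__init__.py (changed in this release but not shown in this diff). A rough sketch of what such a cast might look like, assuming bytes are serialized as base64 strings at rest:

    # Hypothetical stand-in for attempt_cast_to_bytes; the real helper lives in
    # meerschaum.utils.dtypes and its exact behavior is not shown in this diff.
    import base64
    from typing import Any

    def attempt_cast_to_bytes(value: Any) -> Any:
        """Best-effort decode of a base64 string into bytes; return the input on failure."""
        if value is None or isinstance(value, bytes):
            return value
        try:
            return base64.b64decode(value)
        except Exception:
            return value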
@@ -1339,7 +1355,13 @@ def create_pipe_table_from_df(
     """
     Create a pipe's table from its configured dtypes and an incoming dataframe.
    """
-    from meerschaum.utils.dataframe import get_json_cols, get_numeric_cols, get_uuid_cols
+    from meerschaum.utils.dataframe import (
+        get_json_cols,
+        get_numeric_cols,
+        get_uuid_cols,
+        get_datetime_cols,
+        get_bytes_cols,
+    )
     from meerschaum.utils.sql import get_create_table_queries, sql_item_name
     primary_key = pipe.columns.get('primary', None)
     dt_col = pipe.columns.get('datetime', None)
@@ -1365,6 +1387,18 @@ def create_pipe_table_from_df(
             col: 'numeric'
             for col in get_numeric_cols(df)
         },
+        **{
+            col: 'bytes'
+            for col in get_bytes_cols(df)
+        },
+        **{
+            col: 'datetime64[ns, UTC]'
+            for col in get_datetime_cols(df, timezone_aware=True, timezone_naive=False)
+        },
+        **{
+            col: 'datetime64[ns]'
+            for col in get_datetime_cols(df, timezone_aware=False, timezone_naive=True)
+        },
         **pipe.dtypes
     }
     autoincrement = (
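
Note: the inferred entries are merged ahead of `**pipe.dtypes`, so a pipe's explicit dtypes always win on conflict, and `get_datetime_cols` splits timezone-aware from timezone-naive columns via its two keyword flags. A small sketch of the layering (df and dtypes are illustrative):

    # How the dict-merge ordering behaves: inference first, explicit dtypes last.
    import pandas as pd

    df = pd.DataFrame({
        'ts_utc': pd.to_datetime(['2024-01-01'], utc=True),  # tz-aware -> 'datetime64[ns, UTC]'
        'ts_naive': pd.to_datetime(['2024-01-01']),          # tz-naive -> 'datetime64[ns]'
        'payload': [b'\x00\x01'],                            # bytes    -> 'bytes'
    })
    inferred = {'payload': 'bytes', 'ts_utc': 'datetime64[ns, UTC]', 'ts_naive': 'datetime64[ns]'}
    explicit = {'ts_naive': 'datetime64[ns, UTC]'}           # stand-in for pipe.dtypes
    dtypes = {**inferred, **explicit}
    print(dtypes['ts_naive'])                                # 'datetime64[ns, UTC]' -- explicit wins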
@@ -1455,11 +1489,9 @@ def sync_pipe(
         get_update_queries,
         sql_item_name,
         update_queries,
-        get_create_table_queries,
         get_reset_autoincrement_queries,
     )
     from meerschaum.utils.misc import generate_password
-    from meerschaum.utils.dataframe import get_json_cols, get_numeric_cols, get_uuid_cols
     from meerschaum.utils.dtypes import are_dtypes_equal
     from meerschaum.utils.dtypes.sql import get_db_type_from_pd_type
     from meerschaum import Pipe
@@ -1572,6 +1604,7 @@ def sync_pipe(
         'schema': self.get_pipe_schema(pipe),
     })
 
+    dt_col = pipe.columns.get('datetime', None)
     primary_key = pipe.columns.get('primary', None)
     autoincrement = (
         pipe.parameters.get('autoincrement', False)
@@ -1589,17 +1622,23 @@ def sync_pipe(
         if not edit_success:
             return edit_success, edit_msg
 
-    autoincrement_needs_reset = False
+    def _check_pk(_df_to_clear):
+        if _df_to_clear is None:
+            return
+        if primary_key not in _df_to_clear.columns:
+            return
+        if not _df_to_clear[primary_key].notnull().any():
+            del _df_to_clear[primary_key]
+
+    autoincrement_needs_reset = bool(
+        autoincrement
+        and primary_key
+        and primary_key in unseen_df.columns
+        and unseen_df[primary_key].notnull().any()
+    )
     if autoincrement and primary_key:
-        if primary_key not in df.columns:
-            if unseen_df is not None and primary_key in unseen_df.columns:
-                del unseen_df[primary_key]
-            if update_df is not None and primary_key in update_df.columns:
-                del update_df[primary_key]
-            if delta_df is not None and primary_key in delta_df.columns:
-                del delta_df[primary_key]
-        elif unseen_df[primary_key].notnull().any():
-            autoincrement_needs_reset = True
+        for _df_to_clear in (unseen_df, update_df, delta_df):
+            _check_pk(_df_to_clear)
 
     if is_new:
         create_success, create_msg = self.create_pipe_table_from_df(
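
Note: `_check_pk` drops an autoincrement primary-key column from a chunk only when every value in it is null, letting the database assign ids; chunks carrying real key values keep them (and trigger the sequence reset computed above). A runnable sketch of the effect (data illustrative):

    # _check_pk's effect: all-null PK columns are dropped, populated ones kept.
    import pandas as pd

    unseen_df = pd.DataFrame({'id': [None, None], 'val': [1, 2]})
    update_df = pd.DataFrame({'id': [7], 'val': [9]})

    for _df in (unseen_df, update_df):
        if 'id' in _df.columns and not _df['id'].notnull().any():
            del _df['id']

    print(list(unseen_df.columns))  # ['val']       -- database will autoincrement
    print(list(update_df.columns))  # ['id', 'val'] -- explicit ids preserved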
@@ -1612,6 +1651,7 @@ def sync_pipe(
 
     do_identity_insert = bool(
         self.flavor in ('mssql',)
+        and primary_key
         and primary_key in unseen_df.columns
         and autoincrement
     )
@@ -1707,7 +1747,11 @@ def sync_pipe(
         col
         for col_key, col in pipe.columns.items()
         if col and col in existing_cols
-    ]
+    ] if not primary_key else (
+        [dt_col, primary_key]
+        if self.flavor == 'timescaledb' and dt_col and dt_col in update_df.columns
+        else [primary_key]
+    )
     update_queries = get_update_queries(
         pipe.target,
         temp_target,
@@ -1716,7 +1760,8 @@ def sync_pipe(
         upsert=upsert,
         schema=self.get_pipe_schema(pipe),
         patch_schema=self.internal_schema,
-        datetime_col=pipe.columns.get('datetime', None),
+        datetime_col=(dt_col if dt_col in update_df.columns else None),
+        identity_insert=(autoincrement and primary_key in update_df.columns),
         debug=debug,
     )
     update_success = all(
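
Note: when a primary key is defined, updates now join on it alone, except on TimescaleDB, where the datetime axis is kept in the join as well (presumably because the hypertable's partition column must participate). A sketch of the selection (values illustrative):

    # Join-column selection for updates under the new primary-key branch.
    flavor = 'timescaledb'
    primary_key = 'id'
    dt_col = 'timestamp'
    update_df_columns = ['id', 'timestamp', 'val']

    join_cols = (
        [dt_col, primary_key]
        if flavor == 'timescaledb' and dt_col and dt_col in update_df_columns
        else [primary_key]
    )
    print(join_cols)  # ['timestamp', 'id']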
@@ -1834,7 +1879,6 @@ def sync_pipe_inplace(
         session_execute,
         update_queries,
     )
-    from meerschaum.utils.dtypes import are_dtypes_equal
     from meerschaum.utils.dtypes.sql import (
         get_pd_type_from_db_type,
     )
@@ -2054,6 +2098,7 @@ def sync_pipe_inplace(
     ) if not (upsert or static) else new_cols_types
 
     common_cols = [col for col in new_cols if col in backtrack_cols_types]
+    primary_key = pipe.columns.get('primary', None)
     on_cols = {
         col: new_cols.get(col)
         for col_key, col in pipe.columns.items()
@@ -2064,7 +2109,7 @@ def sync_pipe_inplace(
             and col in backtrack_cols_types
             and col in new_cols
         )
-    }
+    } if not primary_key else {primary_key: new_cols.get(primary_key)}
 
     null_replace_new_cols_str = (
         ', '.join([
@@ -2591,7 +2636,7 @@ def get_pipe_rowcount(
     result = self.value(query, debug=debug, silent=True)
     try:
         return int(result)
-    except Exception as e:
+    except Exception:
         return None
 
 
@@ -2616,10 +2661,11 @@ def drop_pipe(
     from meerschaum.utils.sql import table_exists, sql_item_name, DROP_IF_EXISTS_FLAVORS
     success = True
     target = pipe.target
+    schema = self.get_pipe_schema(pipe)
     target_name = (
-        sql_item_name(target, self.flavor, self.get_pipe_schema(pipe))
+        sql_item_name(target, self.flavor, schema)
     )
-    if table_exists(target, self, debug=debug):
+    if table_exists(target, self, schema=schema, debug=debug):
         if_exists_str = "IF EXISTS" if self.flavor in DROP_IF_EXISTS_FLAVORS else ""
         success = self.exec(
             f"DROP TABLE {if_exists_str} {target_name}", silent=True, debug=debug
meerschaum/connectors/sql/_sql.py
@@ -790,7 +790,12 @@ def to_sql(
         truncate_item_name,
         DROP_IF_EXISTS_FLAVORS,
     )
-    from meerschaum.utils.dataframe import get_json_cols, get_numeric_cols, get_uuid_cols
+    from meerschaum.utils.dataframe import (
+        get_json_cols,
+        get_numeric_cols,
+        get_uuid_cols,
+        get_bytes_cols,
+    )
     from meerschaum.utils.dtypes import are_dtypes_equal, quantize_decimal, coerce_timezone
     from meerschaum.utils.dtypes.sql import (
         NUMERIC_PRECISION_FLAVORS,
meerschaum/connectors/valkey/_pipes.py
@@ -46,9 +46,20 @@ def serialize_document(doc: Dict[str, Any]) -> str:
     -------
     A serialized string for the document.
     """
+    from meerschaum.utils.dtypes import serialize_bytes
     return json.dumps(
         doc,
-        default=(lambda x: json_serialize_datetime(x) if hasattr(x, 'tzinfo') else str(x)),
+        default=(
+            lambda x: (
+                json_serialize_datetime(x)
+                if hasattr(x, 'tzinfo')
+                else (
+                    serialize_bytes(x)
+                    if isinstance(x, bytes)
+                    else str(x)
+                )
+            )
+        ),
         separators=(',', ':'),
         sort_keys=True,
     )
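
Note: the default handler now branches three ways: datetime-likes (anything with a tzinfo attribute), bytes, and everything else via str. A self-contained sketch of the same branching, assuming serialize_bytes base64-encodes (its implementation is not shown in this diff):

    # Round-trip sketch of the new serializer branching; helpers are stand-ins.
    import base64
    import json
    from datetime import datetime, timezone

    def json_serialize_datetime(dt):
        return dt.isoformat()

    def serialize_bytes(data: bytes) -> str:
        return base64.b64encode(data).decode('utf-8')

    doc = {'ts': datetime(2024, 1, 1, tzinfo=timezone.utc), 'blob': b'\x00\x01', 'n': 1}
    print(json.dumps(
        doc,
        default=(
            lambda x: (
                json_serialize_datetime(x)
                if hasattr(x, 'tzinfo')
                else (serialize_bytes(x) if isinstance(x, bytes) else str(x))
            )
        ),
        separators=(',', ':'),
        sort_keys=True,
    ))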
meerschaum/core/Pipe/__init__.py
@@ -106,6 +106,7 @@ class Pipe:
         upsert,
         static,
         tzinfo,
+        enforce,
         get_columns,
         get_columns_types,
         get_columns_indices,
@@ -132,6 +133,7 @@ class Pipe:
         _persist_new_json_columns,
         _persist_new_numeric_columns,
         _persist_new_uuid_columns,
+        _persist_new_bytes_columns,
     )
     from ._verify import (
         verify,
@@ -162,12 +164,14 @@ class Pipe:
         upsert: Optional[bool] = None,
         autoincrement: Optional[bool] = None,
         static: Optional[bool] = None,
+        enforce: Optional[bool] = None,
         mrsm_instance: Optional[Union[str, InstanceConnector]] = None,
         cache: bool = False,
         debug: bool = False,
         connector_keys: Optional[str] = None,
         metric_key: Optional[str] = None,
         location_key: Optional[str] = None,
+        instance_keys: Optional[str] = None,
         indexes: Union[Dict[str, str], List[str], None] = None,
     ):
         """
@@ -219,6 +223,10 @@ class Pipe:
         static: Optional[bool], default None
             If `True`, set `static` in the parameters.
 
+        enforce: Optional[bool], default None
+            If `False`, skip data type enforcement.
+            Default behavior is `True`.
+
         temporary: bool, default False
             If `True`, prevent instance tables (pipes, users, plugins) from being created.
 
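Note: the new `enforce` keyword opts a pipe out of dtype coercion entirely (see the `pipe.enforce` property and the `enforce_dtypes` changes below). A minimal usage sketch, with illustrative keys:

    # Create a pipe that skips dtype enforcement on reads and syncs.
    import meerschaum as mrsm

    pipe = mrsm.Pipe(
        'demo', 'temperature',
        instance='sql:main',
        enforce=False,
    )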
@@ -319,11 +327,13 @@ class Pipe:
         if isinstance(static, bool):
             self._attributes['parameters']['static'] = static
 
+        if isinstance(enforce, bool):
+            self._attributes['parameters']['enforce'] = enforce
+
         ### NOTE: The parameters dictionary is {} by default.
         ### A Pipe may be registered without parameters, then edited,
         ### or a Pipe may be registered with parameters set in-memory first.
-        # from meerschaum.config import get_config
-        _mrsm_instance = mrsm_instance if mrsm_instance is not None else instance
+        _mrsm_instance = mrsm_instance if mrsm_instance is not None else (instance or instance_keys)
         if _mrsm_instance is None:
             _mrsm_instance = get_config('meerschaum', 'instance', patch=True)
 
@@ -341,10 +351,10 @@ class Pipe:
         Return the four keys needed to reconstruct this pipe.
         """
         return {
-            'connector': self.connector_keys,
-            'metric': self.metric_key,
-            'location': self.location_key,
-            'instance': self.instance_keys,
+            'connector_keys': self.connector_keys,
+            'metric_key': self.metric_key,
+            'location_key': self.location_key,
+            'instance_keys': self.instance_keys,
         }
 
     def keys(self) -> List[str]:
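
Note: the renamed keys now match the constructor's keyword arguments (including the new instance_keys), so the dictionary can be splatted straight back into Pipe(). Assuming the property shown above is the pipe's `meta` dictionary (only its body appears in this diff):

    # Round-trip a pipe through its keys dictionary.
    import meerschaum as mrsm

    pipe = mrsm.Pipe('demo', 'temperature', instance='sql:main')
    clone = mrsm.Pipe(**pipe.meta)   # keys align with constructor kwargs as of this release
    print(clone == pipe)             # True -- equality compares the four keys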
@@ -385,7 +395,7 @@ class Pipe:
             warnings.simplefilter('ignore')
             try:
                 conn = parse_instance_keys(self.connector_keys)
-            except Exception as e:
+            except Exception:
                 conn = None
             if conn:
                 self._connector = conn
@@ -429,7 +439,7 @@ class Pipe:
         _fetch_patch = {
             'fetch': ({
                 'definition': (
-                    f"SELECT * FROM "
+                    "SELECT * FROM "
                     + sql_item_name(
                         str(self.target),
                         self.instance_connector.flavor,
@@ -467,7 +477,7 @@ class Pipe:
                 and self.location_key == other.location_key
                 and self.instance_keys == other.instance_keys
             )
-        except Exception as e:
+        except Exception:
             return False
 
     def __hash__(self):
@@ -496,11 +506,11 @@ class Pipe:
         Define the state dictionary (pickling).
         """
         return {
-            'connector': self.connector_keys,
-            'metric': self.metric_key,
-            'location': self.location_key,
+            'connector_keys': self.connector_keys,
+            'metric_key': self.metric_key,
+            'location_key': self.location_key,
             'parameters': self.parameters,
-            'instance': self.instance_keys,
+            'instance_keys': self.instance_keys,
         }
 
     def __setstate__(self, _state: Dict[str, Any]):
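
Note: `__getstate__` emits the same renamed keys, so pickles written by this release round-trip cleanly (assuming `__setstate__`, whose body is not shown, accepts them); pickles produced under the old key names may not load. A quick sketch:

    # Pickle round-trip under the renamed state keys (keys illustrative).
    import pickle
    import meerschaum as mrsm

    pipe = mrsm.Pipe('demo', 'temperature', instance='sql:main')
    restored = pickle.loads(pickle.dumps(pipe))
    print(restored == pipe)   # True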
meerschaum/core/Pipe/_attributes.py
@@ -289,6 +289,25 @@ def tzinfo(self) -> Union[None, timezone]:
     return None
 
 
+@property
+def enforce(self) -> bool:
+    """
+    Return the `enforce` parameter for the pipe.
+    """
+    if 'enforce' not in self.parameters:
+        self.parameters['enforce'] = True
+
+    return self.parameters['enforce']
+
+
+@enforce.setter
+def enforce(self, _enforce: bool) -> None:
+    """
+    Set the `enforce` parameter for the pipe.
+    """
+    self.parameters['enforce'] = _enforce
+
+
 def get_columns(self, *args: str, error: bool = False) -> Union[str, Tuple[str]]:
     """
     Check if the requested columns are defined.
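
Note: the getter lazily writes the default True into the pipe's parameters on first access, and the setter stores under the same 'enforce' key. A short sketch of the expected behavior (keys illustrative):

    # The enforce flag defaults to True and is persisted in pipe.parameters.
    import meerschaum as mrsm

    pipe = mrsm.Pipe('demo', 'temperature', instance='sql:main')
    print(pipe.enforce)                # True; the default is written into parameters
    pipe.enforce = False
    print(pipe.parameters['enforce'])  # False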
meerschaum/core/Pipe/_dtypes.py
@@ -41,7 +41,7 @@ def enforce_dtypes(
         )
         return df
 
-    pipe_dtypes = self.dtypes
+    pipe_dtypes = self.dtypes if self.enforce else {}
 
     try:
         if isinstance(df, str):
meerschaum/core/Pipe/_sync.py
@@ -368,10 +368,11 @@ def sync(
         ### Cast to a dataframe and ensure datatypes are what we expect.
         df = self.enforce_dtypes(df, chunksize=chunksize, debug=debug)
 
-        ### Capture `numeric`, `uuid`, and `json` columns.
+        ### Capture `numeric`, `uuid`, `json`, and `bytes` columns.
         self._persist_new_json_columns(df, debug=debug)
         self._persist_new_numeric_columns(df, debug=debug)
         self._persist_new_uuid_columns(df, debug=debug)
+        self._persist_new_bytes_columns(df, debug=debug)
 
         if debug:
             dprint(
@@ -617,11 +618,13 @@ def filter_existing(
         filter_unseen_df,
         add_missing_cols_to_df,
         get_unhashable_cols,
-        get_numeric_cols,
     )
     from meerschaum.utils.dtypes import (
         to_pandas_dtype,
         none_if_null,
+        to_datetime,
+        are_dtypes_equal,
+        value_is_null,
     )
     from meerschaum.config import get_config
     pd = import_pandas()
@@ -669,29 +672,36 @@ def filter_existing(
     ### begin is the oldest data in the new dataframe
     begin, end = None, None
     dt_col = pipe_columns.get('datetime', None)
+    primary_key = pipe_columns.get('primary', None)
     dt_type = self.dtypes.get(dt_col, 'datetime64[ns, UTC]') if dt_col else None
+
+    if autoincrement and primary_key == dt_col and dt_col not in df.columns:
+        if enforce_dtypes:
+            df = self.enforce_dtypes(df, chunksize=chunksize, debug=debug)
+        return df, get_empty_df(), df
+
     try:
-        min_dt_val = df[dt_col].min(skipna=True) if dt_col else None
+        min_dt_val = df[dt_col].min(skipna=True) if dt_col and dt_col in df.columns else None
         if is_dask and min_dt_val is not None:
             min_dt_val = min_dt_val.compute()
         min_dt = (
-            pandas.to_datetime(min_dt_val).to_pydatetime()
-            if min_dt_val is not None and 'datetime' in str(dt_type)
+            to_datetime(min_dt_val, as_pydatetime=True)
+            if min_dt_val is not None and are_dtypes_equal(dt_type, 'datetime')
             else min_dt_val
         )
     except Exception:
         min_dt = None
-    if not ('datetime' in str(type(min_dt))) or str(min_dt) == 'NaT':
-        if 'int' not in str(type(min_dt)).lower():
+
+    if not are_dtypes_equal('datetime', str(type(min_dt))) or value_is_null(min_dt):
+        if not are_dtypes_equal('int', str(type(min_dt))):
             min_dt = None
 
     if isinstance(min_dt, datetime):
-        begin = (
-            round_time(
-                min_dt,
-                to='down'
-            ) - timedelta(minutes=1)
-        )
+        rounded_min_dt = round_time(min_dt, to='down')
+        try:
+            begin = rounded_min_dt - timedelta(minutes=1)
+        except OverflowError:
+            begin = rounded_min_dt
     elif dt_type and 'int' in dt_type.lower():
         begin = min_dt
     elif dt_col is None:
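
Note: the new try/except guards the one-minute backtrack buffer, which overflows when the incoming data's minimum timestamp sits at (or rounds down to) datetime.min. A runnable demonstration of why the guard is needed:

    # datetime.min cannot absorb a one-minute buffer; the guard falls back
    # to the rounded minimum instead of raising.
    from datetime import datetime, timedelta

    rounded_min_dt = datetime.min
    try:
        begin = rounded_min_dt - timedelta(minutes=1)
    except OverflowError:
        begin = rounded_min_dt
    print(begin)   # 0001-01-01 00:00:00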
@@ -699,11 +709,11 @@ def filter_existing(
 
     ### end is the newest data in the new dataframe
     try:
-        max_dt_val = df[dt_col].max(skipna=True) if dt_col else None
+        max_dt_val = df[dt_col].max(skipna=True) if dt_col and dt_col in df.columns else None
         if is_dask and max_dt_val is not None:
             max_dt_val = max_dt_val.compute()
         max_dt = (
-            pandas.to_datetime(max_dt_val).to_pydatetime()
+            to_datetime(max_dt_val, as_pydatetime=True)
             if max_dt_val is not None and 'datetime' in str(dt_type)
             else max_dt_val
         )
@@ -712,8 +722,8 @@ def filter_existing(
         traceback.print_exc()
         max_dt = None
 
-    if ('datetime' not in str(type(max_dt))) or str(min_dt) == 'NaT':
-        if 'int' not in str(type(max_dt)).lower():
+    if not are_dtypes_equal('datetime', str(type(max_dt))) or value_is_null(max_dt):
+        if not are_dtypes_equal('int', str(type(max_dt))):
             max_dt = None
 
     if isinstance(max_dt, datetime):
@@ -723,7 +733,7 @@ def filter_existing(
                 to='down'
             ) + timedelta(minutes=1)
         )
-    elif dt_type and 'int' in dt_type.lower():
+    elif dt_type and 'int' in dt_type.lower() and max_dt is not None:
         end = max_dt + 1
 
     if max_dt is not None and min_dt is not None and min_dt > max_dt:
@@ -738,7 +748,7 @@ def filter_existing(
 
     unique_index_vals = {
         col: df[col].unique()
-        for col in pipe_columns
+        for col in (pipe_columns if not primary_key else [primary_key])
         if col in df.columns and col != dt_col
     } if not date_bound_only else {}
     filter_params_index_limit = get_config('pipes', 'sync', 'filter_params_index_limit')
@@ -777,14 +787,15 @@ def filter_existing(
 
     ### Separate new rows from changed ones.
     on_cols = [
-        col for col_key, col in pipe_columns.items()
+        col
+        for col_key, col in pipe_columns.items()
         if (
             col
             and
             col_key != 'value'
             and col in backtrack_df.columns
         )
-    ]
+    ] if not primary_key else [primary_key]
     self_dtypes = self.dtypes
     on_cols_dtypes = {
         col: to_pandas_dtype(typ)
@@ -1020,3 +1031,32 @@ def _persist_new_json_columns(self, df, debug: bool = False) -> SuccessTuple:
             return edit_success, edit_msg
 
     return True, "Success"
+
+
+def _persist_new_bytes_columns(self, df, debug: bool = False) -> SuccessTuple:
+    """
+    Check for new `bytes` columns and update the parameters.
+    """
+    from meerschaum.utils.dataframe import get_bytes_cols
+    bytes_cols = get_bytes_cols(df)
+    existing_bytes_cols = [col for col, typ in self.dtypes.items() if typ == 'bytes']
+    new_bytes_cols = [col for col in bytes_cols if col not in existing_bytes_cols]
+    if not new_bytes_cols:
+        return True, "Success"
+
+    self._attributes_sync_time = None
+    dt_col = self.columns.get('datetime', None)
+    dtypes = self.parameters.get('dtypes', {})
+    if dt_col not in dtypes:
+        dtypes[dt_col] = 'datetime'
+    dtypes.update({col: 'bytes' for col in bytes_cols})
+    self.parameters['dtypes'] = dtypes
+
+    if not self.temporary:
+        edit_success, edit_msg = self.edit(interactive=False, debug=debug)
+        if not edit_success:
+            warn(f"Unable to update bytes dtypes for {self}:\n{edit_msg}")
+
+        return edit_success, edit_msg
+
+    return True, "Success"
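
Note: `_persist_new_bytes_columns` mirrors the existing json/numeric/uuid persistence hooks and is called on every sync (see the sync() hunk above), so a raw-bytes column is recorded as 'bytes' in the pipe's dtypes the first time it appears. A usage sketch with illustrative keys:

    # Syncing a frame with a bytes column registers it in pipe.dtypes.
    import pandas as pd
    import meerschaum as mrsm

    pipe = mrsm.Pipe('demo', 'blobs', instance='sql:main', columns={'datetime': 'ts'})
    df = pd.DataFrame({
        'ts': pd.to_datetime(['2024-01-01'], utc=True),
        'payload': [b'\x00\x01\x02'],
    })
    pipe.sync(df)
    print(pipe.dtypes.get('payload'))   # 'bytes'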