meerschaum 2.6.16__py3-none-any.whl → 2.7.0rc1__py3-none-any.whl

Files changed (38)
  1. meerschaum/_internal/arguments/_parse_arguments.py +1 -1
  2. meerschaum/actions/delete.py +65 -69
  3. meerschaum/actions/edit.py +22 -2
  4. meerschaum/actions/install.py +1 -2
  5. meerschaum/actions/sync.py +2 -3
  6. meerschaum/config/_default.py +1 -1
  7. meerschaum/config/_paths.py +2 -1
  8. meerschaum/config/_version.py +1 -1
  9. meerschaum/connectors/api/_pipes.py +4 -3
  10. meerschaum/connectors/sql/_create_engine.py +3 -3
  11. meerschaum/connectors/sql/_pipes.py +84 -38
  12. meerschaum/connectors/sql/_sql.py +6 -1
  13. meerschaum/connectors/valkey/_pipes.py +12 -1
  14. meerschaum/core/Pipe/__init__.py +23 -13
  15. meerschaum/core/Pipe/_attributes.py +19 -0
  16. meerschaum/core/Pipe/_dtypes.py +1 -1
  17. meerschaum/core/Pipe/_sync.py +61 -21
  18. meerschaum/core/Pipe/_verify.py +8 -7
  19. meerschaum/jobs/_Job.py +2 -1
  20. meerschaum/plugins/_Plugin.py +11 -14
  21. meerschaum/utils/daemon/Daemon.py +20 -13
  22. meerschaum/utils/dataframe.py +175 -13
  23. meerschaum/utils/dtypes/__init__.py +103 -14
  24. meerschaum/utils/dtypes/sql.py +26 -0
  25. meerschaum/utils/misc.py +8 -8
  26. meerschaum/utils/packages/_packages.py +1 -1
  27. meerschaum/utils/schedule.py +8 -3
  28. meerschaum/utils/sql.py +70 -47
  29. meerschaum/utils/venv/_Venv.py +4 -4
  30. meerschaum/utils/venv/__init__.py +33 -13
  31. {meerschaum-2.6.16.dist-info → meerschaum-2.7.0rc1.dist-info}/METADATA +2 -2
  32. {meerschaum-2.6.16.dist-info → meerschaum-2.7.0rc1.dist-info}/RECORD +38 -38
  33. {meerschaum-2.6.16.dist-info → meerschaum-2.7.0rc1.dist-info}/LICENSE +0 -0
  34. {meerschaum-2.6.16.dist-info → meerschaum-2.7.0rc1.dist-info}/NOTICE +0 -0
  35. {meerschaum-2.6.16.dist-info → meerschaum-2.7.0rc1.dist-info}/WHEEL +0 -0
  36. {meerschaum-2.6.16.dist-info → meerschaum-2.7.0rc1.dist-info}/entry_points.txt +0 -0
  37. {meerschaum-2.6.16.dist-info → meerschaum-2.7.0rc1.dist-info}/top_level.txt +0 -0
  38. {meerschaum-2.6.16.dist-info → meerschaum-2.7.0rc1.dist-info}/zip-safe +0 -0
meerschaum/connectors/sql/_pipes.py

@@ -460,10 +460,16 @@ def get_create_index_queries(
         else None
     )
     primary_key_constraint_name = (
-        sql_item_name(f'pk_{pipe.target}', self.flavor, None)
+        sql_item_name(f'PK_{pipe.target}', self.flavor, None)
         if primary_key is not None
         else None
     )
+    primary_key_clustered = "CLUSTERED" if _datetime is None else "NONCLUSTERED"
+    datetime_clustered = (
+        "CLUSTERED"
+        if not existing_primary_keys and _datetime is not None
+        else "NONCLUSTERED"
+    )

     _id_index_name = (
         sql_item_name(index_names['id'], self.flavor, None)
@@ -474,6 +480,7 @@ def get_create_index_queries(
     _create_space_partition = get_config('system', 'experimental', 'space')

     ### create datetime index
+    dt_query = None
     if _datetime is not None:
         if self.flavor == 'timescaledb' and pipe.parameters.get('hypertable', True):
             _id_count = (
@@ -504,19 +511,19 @@ def get_create_index_queries(
                 + 'if_not_exists => true, '
                 + "migrate_data => true);"
             )
-        elif self.flavor == 'mssql':
-            dt_query = (
-                "CREATE "
-                + ("CLUSTERED " if not primary_key else '')
-                + f"INDEX {_datetime_index_name} "
-                + f"ON {_pipe_name} ({_datetime_name})"
-            )
-        else: ### mssql, sqlite, etc.
-            dt_query = (
-                f"CREATE INDEX {_datetime_index_name} "
-                + f"ON {_pipe_name} ({_datetime_name})"
-            )
+        elif _datetime_index_name:
+            if self.flavor == 'mssql':
+                dt_query = (
+                    f"CREATE {datetime_clustered} INDEX {_datetime_index_name} "
+                    f"ON {_pipe_name} ({_datetime_name})"
+                )
+            else:
+                dt_query = (
+                    f"CREATE INDEX {_datetime_index_name} "
+                    + f"ON {_pipe_name} ({_datetime_name})"
+                )

+    if dt_query:
         index_queries[_datetime] = [dt_query]

     primary_queries = []
@@ -623,7 +630,7 @@ def get_create_index_queries(
             ),
             (
                 f"ALTER TABLE {_pipe_name}\n"
-                f"ADD CONSTRAINT {primary_key_constraint_name} PRIMARY KEY ({primary_key_name})"
+                f"ADD CONSTRAINT {primary_key_constraint_name} PRIMARY KEY {primary_key_clustered} ({primary_key_name})"
             ),
         ])
         index_queries[primary_key] = primary_queries
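
Note on the two hunks above: SQL Server allows only one clustered index per table, so the new `primary_key_clustered` / `datetime_clustered` flags decide which index claims it. A minimal standalone sketch of that decision (the `choose_clustering` helper is hypothetical, not part of meerschaum):

```python
from typing import Optional, Tuple

def choose_clustering(
    has_existing_pk: bool,
    datetime_col: Optional[str],
) -> Tuple[str, str]:
    """Mirror the CLUSTERED/NONCLUSTERED assignment in the diff above."""
    primary_key_clustered = "CLUSTERED" if datetime_col is None else "NONCLUSTERED"
    datetime_clustered = (
        "CLUSTERED"
        if not has_existing_pk and datetime_col is not None
        else "NONCLUSTERED"
    )
    return primary_key_clustered, datetime_clustered

# The datetime index claims the single clustered slot when present and no
# primary key already exists; otherwise the primary-key constraint takes it.
assert choose_clustering(False, "timestamp") == ("NONCLUSTERED", "CLUSTERED")
assert choose_clustering(False, None) == ("CLUSTERED", "NONCLUSTERED")
assert choose_clustering(True, "timestamp") == ("NONCLUSTERED", "NONCLUSTERED")
```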
@@ -875,6 +882,7 @@ def get_pipe_data(
     from meerschaum.utils.dtypes import (
         attempt_cast_to_numeric,
         attempt_cast_to_uuid,
+        attempt_cast_to_bytes,
         are_dtypes_equal,
     )
     from meerschaum.utils.dtypes.sql import get_pd_type_from_db_type
@@ -891,17 +899,15 @@ def get_pipe_data(
             col: get_pd_type_from_db_type(typ)
             for col, typ in cols_types.items()
         }
-    }
+    } if pipe.enforce else {}
     if dtypes:
         if self.flavor == 'sqlite':
             if not pipe.columns.get('datetime', None):
                 _dt = pipe.guess_datetime()
                 dt = sql_item_name(_dt, self.flavor, None) if _dt else None
-                is_guess = True
             else:
                 _dt = pipe.get_columns('datetime')
                 dt = sql_item_name(_dt, self.flavor, None)
-                is_guess = False

         if _dt:
             dt_type = dtypes.get(_dt, 'object').lower()
@@ -929,7 +935,7 @@ def get_pipe_data(
         col: to_pandas_dtype(typ)
         for col, typ in dtypes.items()
         if col in select_columns and col not in (omit_columns or [])
-    }
+    } if pipe.enforce else {}
     query = self.get_pipe_data_query(
         pipe,
         select_columns=select_columns,
@@ -959,6 +965,11 @@ def get_pipe_data(
         for col, typ in pipe.dtypes.items()
         if typ == 'uuid' and col in dtypes
     ]
+    bytes_columns = [
+        col
+        for col, typ in pipe.dtypes.items()
+        if typ == 'bytes' and col in dtypes
+    ]

     kw['coerce_float'] = kw.get('coerce_float', (len(numeric_columns) == 0))
@@ -978,6 +989,11 @@ def get_pipe_data(
             continue
         df[col] = df[col].apply(attempt_cast_to_uuid)

+    for col in bytes_columns:
+        if col not in df.columns:
+            continue
+        df[col] = df[col].apply(attempt_cast_to_bytes)
+
     if self.flavor == 'sqlite':
         ignore_dt_cols = [
             col
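
The new `bytes_columns` pass restores raw bytes for columns declared as `bytes`. A hedged sketch of what such a cast can look like, assuming bytes are stored base64-encoded (the shipped helper is `meerschaum.utils.dtypes.attempt_cast_to_bytes` and may differ in detail):

```python
import base64

def _attempt_cast_to_bytes(value):
    """Best-effort decode; return the input unchanged if it isn't base64."""
    if isinstance(value, bytes) or value is None:
        return value
    try:
        return base64.b64decode(value, validate=True)
    except Exception:
        return value

print(_attempt_cast_to_bytes('AAE='))  # b'\x00\x01'
```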
@@ -1339,7 +1355,13 @@ def create_pipe_table_from_df(
     """
     Create a pipe's table from its configured dtypes and an incoming dataframe.
     """
-    from meerschaum.utils.dataframe import get_json_cols, get_numeric_cols, get_uuid_cols
+    from meerschaum.utils.dataframe import (
+        get_json_cols,
+        get_numeric_cols,
+        get_uuid_cols,
+        get_datetime_cols,
+        get_bytes_cols,
+    )
     from meerschaum.utils.sql import get_create_table_queries, sql_item_name
     primary_key = pipe.columns.get('primary', None)
     dt_col = pipe.columns.get('datetime', None)
@@ -1365,6 +1387,18 @@ def create_pipe_table_from_df(
             col: 'numeric'
             for col in get_numeric_cols(df)
         },
+        **{
+            col: 'bytes'
+            for col in get_bytes_cols(df)
+        },
+        **{
+            col: 'datetime64[ns, UTC]'
+            for col in get_datetime_cols(df, timezone_aware=True, timezone_naive=False)
+        },
+        **{
+            col: 'datetime64[ns]'
+            for col in get_datetime_cols(df, timezone_aware=False, timezone_naive=True)
+        },
         **pipe.dtypes
     }
     autoincrement = (
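
`get_datetime_cols` is called here with `timezone_aware` / `timezone_naive` flags to map tz-aware columns to `datetime64[ns, UTC]` and naive ones to `datetime64[ns]`. A rough standalone equivalent of that split (a sketch, not the shipped implementation):

```python
import pandas as pd

df = pd.DataFrame({
    'utc_ts': pd.to_datetime(['2024-01-01'], utc=True),
    'naive_ts': pd.to_datetime(['2024-01-01']),
})

# tz-aware dtypes expose a `tz` attribute; plain numpy datetime64 dtypes do not.
aware_cols = [
    col for col in df.columns
    if getattr(df[col].dtype, 'tz', None) is not None
]
naive_cols = [
    col for col in df.columns
    if str(df[col].dtype).startswith('datetime64')
    and getattr(df[col].dtype, 'tz', None) is None
]
print(aware_cols, naive_cols)  # ['utc_ts'] ['naive_ts']
```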
@@ -1455,11 +1489,9 @@ def sync_pipe(
         get_update_queries,
         sql_item_name,
         update_queries,
-        get_create_table_queries,
         get_reset_autoincrement_queries,
     )
     from meerschaum.utils.misc import generate_password
-    from meerschaum.utils.dataframe import get_json_cols, get_numeric_cols, get_uuid_cols
     from meerschaum.utils.dtypes import are_dtypes_equal
     from meerschaum.utils.dtypes.sql import get_db_type_from_pd_type
     from meerschaum import Pipe
@@ -1572,6 +1604,7 @@ def sync_pipe(
         'schema': self.get_pipe_schema(pipe),
     })

+    dt_col = pipe.columns.get('datetime', None)
     primary_key = pipe.columns.get('primary', None)
     autoincrement = (
         pipe.parameters.get('autoincrement', False)
@@ -1589,17 +1622,23 @@ def sync_pipe(
     if not edit_success:
         return edit_success, edit_msg

-    autoincrement_needs_reset = False
+    def _check_pk(_df_to_clear):
+        if _df_to_clear is None:
+            return
+        if primary_key not in _df_to_clear.columns:
+            return
+        if not _df_to_clear[primary_key].notnull().any():
+            del _df_to_clear[primary_key]
+
+    autoincrement_needs_reset = bool(
+        autoincrement
+        and primary_key
+        and primary_key in unseen_df.columns
+        and unseen_df[primary_key].notnull().any()
+    )
     if autoincrement and primary_key:
-        if primary_key not in df.columns:
-            if unseen_df is not None and primary_key in unseen_df.columns:
-                del unseen_df[primary_key]
-            if update_df is not None and primary_key in update_df.columns:
-                del update_df[primary_key]
-            if delta_df is not None and primary_key in delta_df.columns:
-                del delta_df[primary_key]
-        elif unseen_df[primary_key].notnull().any():
-            autoincrement_needs_reset = True
+        for _df_to_clear in (unseen_df, update_df, delta_df):
+            _check_pk(_df_to_clear)

     if is_new:
         create_success, create_msg = self.create_pipe_table_from_df(
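
The refactored `_check_pk` drops an autoincrement primary-key column from a chunk only when it is entirely null, so explicitly supplied key values survive (and trigger a sequence reset). A standalone demonstration with pandas:

```python
import pandas as pd

def _check_pk(df, primary_key='id'):
    if df is None or primary_key not in df.columns:
        return
    if not df[primary_key].notnull().any():
        del df[primary_key]

all_null = pd.DataFrame({'id': [None, None], 'val': [1, 2]})
_check_pk(all_null)
print('id' in all_null.columns)  # False: a fully-null key column is dropped

provided = pd.DataFrame({'id': [10, None], 'val': [1, 2]})
_check_pk(provided)
print('id' in provided.columns)  # True: supplied keys are kept
```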
@@ -1612,6 +1651,7 @@ def sync_pipe(

     do_identity_insert = bool(
         self.flavor in ('mssql',)
+        and primary_key
         and primary_key in unseen_df.columns
         and autoincrement
     )
@@ -1707,7 +1747,11 @@ def sync_pipe(
         col
         for col_key, col in pipe.columns.items()
         if col and col in existing_cols
-    ]
+    ] if not primary_key else (
+        [dt_col, primary_key]
+        if self.flavor == 'timescaledb' and dt_col and dt_col in update_df.columns
+        else [primary_key]
+    )
     update_queries = get_update_queries(
         pipe.target,
         temp_target,
@@ -1716,7 +1760,8 @@ def sync_pipe(
         upsert=upsert,
         schema=self.get_pipe_schema(pipe),
         patch_schema=self.internal_schema,
-        datetime_col=pipe.columns.get('datetime', None),
+        datetime_col=(dt_col if dt_col in update_df.columns else None),
+        identity_insert=(autoincrement and primary_key in update_df.columns),
         debug=debug,
     )
     update_success = all(
@@ -1834,7 +1879,6 @@ def sync_pipe_inplace(
         session_execute,
         update_queries,
     )
-    from meerschaum.utils.dtypes import are_dtypes_equal
     from meerschaum.utils.dtypes.sql import (
         get_pd_type_from_db_type,
     )
@@ -2054,6 +2098,7 @@ def sync_pipe_inplace(
     ) if not (upsert or static) else new_cols_types

     common_cols = [col for col in new_cols if col in backtrack_cols_types]
+    primary_key = pipe.columns.get('primary', None)
     on_cols = {
         col: new_cols.get(col)
         for col_key, col in pipe.columns.items()
@@ -2064,7 +2109,7 @@ def sync_pipe_inplace(
             and col in backtrack_cols_types
             and col in new_cols
         )
-    }
+    } if not primary_key else {primary_key: new_cols.get(primary_key)}

     null_replace_new_cols_str = (
         ', '.join([
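
With a primary key defined, the inplace merge now joins on that single column instead of every configured index. A simplified illustration of the two hunks above (hypothetical column names):

```python
pipe_columns = {'datetime': 'ts', 'id': 'station', 'primary': 'row_id'}
new_cols = {'ts': 'TIMESTAMP', 'station': 'INT', 'row_id': 'INT'}

primary_key = pipe_columns.get('primary', None)
on_cols = (
    # Without a primary key, join on every configured index column.
    {col: new_cols.get(col) for key, col in pipe_columns.items() if key != 'primary'}
    if not primary_key
    # With one, the primary key alone identifies a row.
    else {primary_key: new_cols.get(primary_key)}
)
print(on_cols)  # {'row_id': 'INT'}
```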
@@ -2591,7 +2636,7 @@ def get_pipe_rowcount(
     result = self.value(query, debug=debug, silent=True)
     try:
         return int(result)
-    except Exception as e:
+    except Exception:
         return None

@@ -2616,10 +2661,11 @@ def drop_pipe(
     from meerschaum.utils.sql import table_exists, sql_item_name, DROP_IF_EXISTS_FLAVORS
     success = True
     target = pipe.target
+    schema = self.get_pipe_schema(pipe)
     target_name = (
-        sql_item_name(target, self.flavor, self.get_pipe_schema(pipe))
+        sql_item_name(target, self.flavor, schema)
     )
-    if table_exists(target, self, debug=debug):
+    if table_exists(target, self, schema=schema, debug=debug):
         if_exists_str = "IF EXISTS" if self.flavor in DROP_IF_EXISTS_FLAVORS else ""
         success = self.exec(
             f"DROP TABLE {if_exists_str} {target_name}", silent=True, debug=debug
meerschaum/connectors/sql/_sql.py

@@ -790,7 +790,12 @@ def to_sql(
         truncate_item_name,
         DROP_IF_EXISTS_FLAVORS,
     )
-    from meerschaum.utils.dataframe import get_json_cols, get_numeric_cols, get_uuid_cols
+    from meerschaum.utils.dataframe import (
+        get_json_cols,
+        get_numeric_cols,
+        get_uuid_cols,
+        get_bytes_cols,
+    )
     from meerschaum.utils.dtypes import are_dtypes_equal, quantize_decimal, coerce_timezone
     from meerschaum.utils.dtypes.sql import (
         NUMERIC_PRECISION_FLAVORS,
meerschaum/connectors/valkey/_pipes.py

@@ -46,9 +46,20 @@ def serialize_document(doc: Dict[str, Any]) -> str:
     -------
     A serialized string for the document.
     """
+    from meerschaum.utils.dtypes import serialize_bytes
     return json.dumps(
         doc,
-        default=(lambda x: json_serialize_datetime(x) if hasattr(x, 'tzinfo') else str(x)),
+        default=(
+            lambda x: (
+                json_serialize_datetime(x)
+                if hasattr(x, 'tzinfo')
+                else (
+                    serialize_bytes(x)
+                    if isinstance(x, bytes)
+                    else str(x)
+                )
+            )
+        ),
         separators=(',', ':'),
         sort_keys=True,
     )
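
The expanded `default=` chain serializes datetimes first, then bytes, then falls back to `str`. A self-contained sketch of the same chain, with `isoformat()` standing in for `json_serialize_datetime` and base64 assumed for `serialize_bytes`:

```python
import base64
import json
from datetime import datetime, timezone

def _default(x):
    if hasattr(x, 'tzinfo'):   # datetime-like values first
        return x.isoformat()
    if isinstance(x, bytes):   # then raw bytes
        return base64.b64encode(x).decode('ascii')
    return str(x)              # fallback for everything else

doc = {'ts': datetime(2024, 1, 1, tzinfo=timezone.utc), 'blob': b'\x00\x01'}
print(json.dumps(doc, default=_default, separators=(',', ':'), sort_keys=True))
# {"blob":"AAE=","ts":"2024-01-01T00:00:00+00:00"}
```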
meerschaum/core/Pipe/__init__.py

@@ -106,6 +106,7 @@ class Pipe:
         upsert,
         static,
         tzinfo,
+        enforce,
         get_columns,
         get_columns_types,
         get_columns_indices,
@@ -132,6 +133,7 @@ class Pipe:
         _persist_new_json_columns,
         _persist_new_numeric_columns,
         _persist_new_uuid_columns,
+        _persist_new_bytes_columns,
     )
     from ._verify import (
         verify,
@@ -162,12 +164,14 @@ class Pipe:
         upsert: Optional[bool] = None,
         autoincrement: Optional[bool] = None,
         static: Optional[bool] = None,
+        enforce: Optional[bool] = None,
         mrsm_instance: Optional[Union[str, InstanceConnector]] = None,
         cache: bool = False,
         debug: bool = False,
         connector_keys: Optional[str] = None,
         metric_key: Optional[str] = None,
         location_key: Optional[str] = None,
+        instance_keys: Optional[str] = None,
         indexes: Union[Dict[str, str], List[str], None] = None,
     ):
         """
@@ -219,6 +223,10 @@ class Pipe:
         static: Optional[bool], default None
             If `True`, set `static` in the parameters.

+        enforce: Optional[bool], default None
+            If `False`, skip data type enforcement.
+            Default behavior is `True`.
+
         temporary: bool, default False
             If `True`, prevent instance tables (pipes, users, plugins) from being created.
@@ -319,11 +327,13 @@ class Pipe:
         if isinstance(static, bool):
             self._attributes['parameters']['static'] = static

+        if isinstance(enforce, bool):
+            self._attributes['parameters']['enforce'] = enforce
+
         ### NOTE: The parameters dictionary is {} by default.
         ### A Pipe may be registered without parameters, then edited,
         ### or a Pipe may be registered with parameters set in-memory first.
-        # from meerschaum.config import get_config
-        _mrsm_instance = mrsm_instance if mrsm_instance is not None else instance
+        _mrsm_instance = mrsm_instance if mrsm_instance is not None else (instance or instance_keys)
         if _mrsm_instance is None:
             _mrsm_instance = get_config('meerschaum', 'instance', patch=True)
@@ -341,10 +351,10 @@ class Pipe:
         Return the four keys needed to reconstruct this pipe.
         """
         return {
-            'connector': self.connector_keys,
-            'metric': self.metric_key,
-            'location': self.location_key,
-            'instance': self.instance_keys,
+            'connector_keys': self.connector_keys,
+            'metric_key': self.metric_key,
+            'location_key': self.location_key,
+            'instance_keys': self.instance_keys,
         }

     def keys(self) -> List[str]:
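
Because `meta` now emits `connector_keys` / `metric_key` / `location_key` / `instance_keys`, which match the constructor's new keyword arguments, a pipe should round-trip through its own metadata. A sketch (assumes a configured `sql:main` instance):

```python
import meerschaum as mrsm

pipe = mrsm.Pipe('plugin:noaa', 'weather', instance='sql:main')

# The meta keys now align with the constructor's keyword arguments.
clone = mrsm.Pipe(**pipe.meta)
print(clone == pipe)  # True
```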
@@ -385,7 +395,7 @@ class Pipe:
         warnings.simplefilter('ignore')
         try:
             conn = parse_instance_keys(self.connector_keys)
-        except Exception as e:
+        except Exception:
             conn = None
         if conn:
             self._connector = conn
@@ -429,7 +439,7 @@ class Pipe:
         _fetch_patch = {
             'fetch': ({
                 'definition': (
-                    f"SELECT * FROM "
+                    "SELECT * FROM "
                     + sql_item_name(
                         str(self.target),
                         self.instance_connector.flavor,
@@ -467,7 +477,7 @@ class Pipe:
             and self.location_key == other.location_key
             and self.instance_keys == other.instance_keys
         )
-        except Exception as e:
+        except Exception:
             return False

     def __hash__(self):
@@ -496,11 +506,11 @@ class Pipe:
         Define the state dictionary (pickling).
         """
         return {
-            'connector': self.connector_keys,
-            'metric': self.metric_key,
-            'location': self.location_key,
+            'connector_keys': self.connector_keys,
+            'metric_key': self.metric_key,
+            'location_key': self.location_key,
             'parameters': self.parameters,
-            'instance': self.instance_keys,
+            'instance_keys': self.instance_keys,
         }

     def __setstate__(self, _state: Dict[str, Any]):
meerschaum/core/Pipe/_attributes.py

@@ -289,6 +289,25 @@ def tzinfo(self) -> Union[None, timezone]:
     return None


+@property
+def enforce(self) -> bool:
+    """
+    Return the `enforce` parameter for the pipe.
+    """
+    if 'enforce' not in self.parameters:
+        self.parameters['enforce'] = True
+
+    return self.parameters['enforce']
+
+
+@enforce.setter
+def enforce(self, _enforce: bool) -> None:
+    """
+    Set the `enforce` parameter for the pipe.
+    """
+    self.parameters['enforce'] = _enforce
+
+
 def get_columns(self, *args: str, error: bool = False) -> Union[str, Tuple[str]]:
     """
     Check if the requested columns are defined.
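
Usage of the new `enforce` flag added above (a sketch; assumes a configured `sql:main` instance):

```python
import meerschaum as mrsm

pipe = mrsm.Pipe(
    'demo', 'temperature',
    instance='sql:main',
    enforce=False,  # skip dtype coercion on sync and reads
)
print(pipe.enforce)                    # False
print(pipe.parameters.get('enforce'))  # False: persisted in the parameters
```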
meerschaum/core/Pipe/_dtypes.py

@@ -41,7 +41,7 @@ def enforce_dtypes(
         )
         return df

-    pipe_dtypes = self.dtypes
+    pipe_dtypes = self.dtypes if self.enforce else {}

     try:
         if isinstance(df, str):
meerschaum/core/Pipe/_sync.py

@@ -368,10 +368,11 @@ def sync(
             ### Cast to a dataframe and ensure datatypes are what we expect.
             df = self.enforce_dtypes(df, chunksize=chunksize, debug=debug)

-            ### Capture `numeric`, `uuid`, and `json` columns.
+            ### Capture `numeric`, `uuid`, `json`, and `bytes` columns.
             self._persist_new_json_columns(df, debug=debug)
             self._persist_new_numeric_columns(df, debug=debug)
             self._persist_new_uuid_columns(df, debug=debug)
+            self._persist_new_bytes_columns(df, debug=debug)

             if debug:
                 dprint(
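
With `_persist_new_bytes_columns` wired into `sync`, a dataframe containing raw bytes should register a `bytes` dtype on the pipe. A sketch (assumes a configured `sql:main` instance):

```python
import meerschaum as mrsm
import pandas as pd

pipe = mrsm.Pipe('demo', 'blobs', instance='sql:main', columns={'datetime': 'ts'})
df = pd.DataFrame({
    'ts': pd.to_datetime(['2024-01-01'], utc=True),
    'payload': [b'\x00\xff'],
})
pipe.sync(df)
print(pipe.dtypes.get('payload'))  # 'bytes'
```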
@@ -617,11 +618,13 @@ def filter_existing(
         filter_unseen_df,
         add_missing_cols_to_df,
         get_unhashable_cols,
-        get_numeric_cols,
     )
     from meerschaum.utils.dtypes import (
         to_pandas_dtype,
         none_if_null,
+        to_datetime,
+        are_dtypes_equal,
+        value_is_null,
     )
     from meerschaum.config import get_config
     pd = import_pandas()
@@ -669,29 +672,36 @@ def filter_existing(
     ### begin is the oldest data in the new dataframe
     begin, end = None, None
     dt_col = pipe_columns.get('datetime', None)
+    primary_key = pipe_columns.get('primary', None)
     dt_type = self.dtypes.get(dt_col, 'datetime64[ns, UTC]') if dt_col else None
+
+    if autoincrement and primary_key == dt_col and dt_col not in df.columns:
+        if enforce_dtypes:
+            df = self.enforce_dtypes(df, chunksize=chunksize, debug=debug)
+        return df, get_empty_df(), df
+
     try:
-        min_dt_val = df[dt_col].min(skipna=True) if dt_col else None
+        min_dt_val = df[dt_col].min(skipna=True) if dt_col and dt_col in df.columns else None
         if is_dask and min_dt_val is not None:
             min_dt_val = min_dt_val.compute()
         min_dt = (
-            pandas.to_datetime(min_dt_val).to_pydatetime()
-            if min_dt_val is not None and 'datetime' in str(dt_type)
+            to_datetime(min_dt_val, as_pydatetime=True)
+            if min_dt_val is not None and are_dtypes_equal(dt_type, 'datetime')
             else min_dt_val
         )
     except Exception:
         min_dt = None
-    if not ('datetime' in str(type(min_dt))) or str(min_dt) == 'NaT':
-        if 'int' not in str(type(min_dt)).lower():
+
+    if not are_dtypes_equal('datetime', str(type(min_dt))) or value_is_null(min_dt):
+        if not are_dtypes_equal('int', str(type(min_dt))):
             min_dt = None

     if isinstance(min_dt, datetime):
-        begin = (
-            round_time(
-                min_dt,
-                to='down'
-            ) - timedelta(minutes=1)
-        )
+        rounded_min_dt = round_time(min_dt, to='down')
+        try:
+            begin = rounded_min_dt - timedelta(minutes=1)
+        except OverflowError:
+            begin = rounded_min_dt
     elif dt_type and 'int' in dt_type.lower():
         begin = min_dt
     elif dt_col is None:
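
The new `try`/`except OverflowError` guards the backtrack window against `datetime.min`, where subtracting a minute is impossible:

```python
from datetime import datetime, timedelta

rounded_min_dt = datetime.min  # oldest representable timestamp
try:
    begin = rounded_min_dt - timedelta(minutes=1)
except OverflowError:
    begin = rounded_min_dt     # fall back to the unshifted bound
print(begin)  # 0001-01-01 00:00:00
```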
@@ -699,11 +709,11 @@ def filter_existing(

     ### end is the newest data in the new dataframe
     try:
-        max_dt_val = df[dt_col].max(skipna=True) if dt_col else None
+        max_dt_val = df[dt_col].max(skipna=True) if dt_col and dt_col in df.columns else None
         if is_dask and max_dt_val is not None:
             max_dt_val = max_dt_val.compute()
         max_dt = (
-            pandas.to_datetime(max_dt_val).to_pydatetime()
+            to_datetime(max_dt_val, as_pydatetime=True)
             if max_dt_val is not None and 'datetime' in str(dt_type)
             else max_dt_val
         )
@@ -712,8 +722,8 @@ def filter_existing(
         traceback.print_exc()
         max_dt = None

-    if ('datetime' not in str(type(max_dt))) or str(min_dt) == 'NaT':
-        if 'int' not in str(type(max_dt)).lower():
+    if not are_dtypes_equal('datetime', str(type(max_dt))) or value_is_null(max_dt):
+        if not are_dtypes_equal('int', str(type(max_dt))):
             max_dt = None

     if isinstance(max_dt, datetime):
@@ -723,7 +733,7 @@ def filter_existing(
                 to='down'
             ) + timedelta(minutes=1)
         )
-    elif dt_type and 'int' in dt_type.lower():
+    elif dt_type and 'int' in dt_type.lower() and max_dt is not None:
         end = max_dt + 1

     if max_dt is not None and min_dt is not None and min_dt > max_dt:
@@ -738,7 +748,7 @@ def filter_existing(

     unique_index_vals = {
         col: df[col].unique()
-        for col in pipe_columns
+        for col in (pipe_columns if not primary_key else [primary_key])
         if col in df.columns and col != dt_col
     } if not date_bound_only else {}
     filter_params_index_limit = get_config('pipes', 'sync', 'filter_params_index_limit')
@@ -777,14 +787,15 @@ def filter_existing(

     ### Separate new rows from changed ones.
     on_cols = [
-        col for col_key, col in pipe_columns.items()
+        col
+        for col_key, col in pipe_columns.items()
         if (
             col
             and
             col_key != 'value'
             and col in backtrack_df.columns
         )
-    ]
+    ] if not primary_key else [primary_key]
     self_dtypes = self.dtypes
     on_cols_dtypes = {
         col: to_pandas_dtype(typ)
@@ -1020,3 +1031,32 @@ def _persist_new_json_columns(self, df, debug: bool = False) -> SuccessTuple:
         return edit_success, edit_msg

     return True, "Success"
+
+
+def _persist_new_bytes_columns(self, df, debug: bool = False) -> SuccessTuple:
+    """
+    Check for new `bytes` columns and update the parameters.
+    """
+    from meerschaum.utils.dataframe import get_bytes_cols
+    bytes_cols = get_bytes_cols(df)
+    existing_bytes_cols = [col for col, typ in self.dtypes.items() if typ == 'bytes']
+    new_bytes_cols = [col for col in bytes_cols if col not in existing_bytes_cols]
+    if not new_bytes_cols:
+        return True, "Success"
+
+    self._attributes_sync_time = None
+    dt_col = self.columns.get('datetime', None)
+    dtypes = self.parameters.get('dtypes', {})
+    if dt_col not in dtypes:
+        dtypes[dt_col] = 'datetime'
+    dtypes.update({col: 'bytes' for col in bytes_cols})
+    self.parameters['dtypes'] = dtypes
+
+    if not self.temporary:
+        edit_success, edit_msg = self.edit(interactive=False, debug=debug)
+        if not edit_success:
+            warn(f"Unable to update bytes dtypes for {self}:\n{edit_msg}")
+
+        return edit_success, edit_msg
+
+    return True, "Success"
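
For reference, a hedged sketch of the bytes-column detection `_persist_new_bytes_columns` relies on (assumed behavior of `meerschaum.utils.dataframe.get_bytes_cols`; the shipped version may differ):

```python
import pandas as pd

def get_bytes_cols(df):
    """Return columns whose first non-null value is a bytes object."""
    bytes_cols = []
    for col in df.columns:
        non_null = df[col].dropna()
        if not non_null.empty and isinstance(non_null.iloc[0], bytes):
            bytes_cols.append(col)
    return bytes_cols

df = pd.DataFrame({'payload': [b'\x01', None], 'note': ['text', 'more']})
print(get_bytes_cols(df))  # ['payload']
```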