meerschaum 2.6.16__py3-none-any.whl → 2.7.0rc1__py3-none-any.whl
- meerschaum/_internal/arguments/_parse_arguments.py +1 -1
- meerschaum/actions/delete.py +65 -69
- meerschaum/actions/edit.py +22 -2
- meerschaum/actions/install.py +1 -2
- meerschaum/actions/sync.py +2 -3
- meerschaum/config/_default.py +1 -1
- meerschaum/config/_paths.py +2 -1
- meerschaum/config/_version.py +1 -1
- meerschaum/connectors/api/_pipes.py +4 -3
- meerschaum/connectors/sql/_create_engine.py +3 -3
- meerschaum/connectors/sql/_pipes.py +84 -38
- meerschaum/connectors/sql/_sql.py +6 -1
- meerschaum/connectors/valkey/_pipes.py +12 -1
- meerschaum/core/Pipe/__init__.py +23 -13
- meerschaum/core/Pipe/_attributes.py +19 -0
- meerschaum/core/Pipe/_dtypes.py +1 -1
- meerschaum/core/Pipe/_sync.py +61 -21
- meerschaum/core/Pipe/_verify.py +8 -7
- meerschaum/jobs/_Job.py +2 -1
- meerschaum/plugins/_Plugin.py +11 -14
- meerschaum/utils/daemon/Daemon.py +20 -13
- meerschaum/utils/dataframe.py +175 -13
- meerschaum/utils/dtypes/__init__.py +103 -14
- meerschaum/utils/dtypes/sql.py +26 -0
- meerschaum/utils/misc.py +8 -8
- meerschaum/utils/packages/_packages.py +1 -1
- meerschaum/utils/schedule.py +8 -3
- meerschaum/utils/sql.py +70 -47
- meerschaum/utils/venv/_Venv.py +4 -4
- meerschaum/utils/venv/__init__.py +33 -13
- {meerschaum-2.6.16.dist-info → meerschaum-2.7.0rc1.dist-info}/METADATA +2 -2
- {meerschaum-2.6.16.dist-info → meerschaum-2.7.0rc1.dist-info}/RECORD +38 -38
- {meerschaum-2.6.16.dist-info → meerschaum-2.7.0rc1.dist-info}/LICENSE +0 -0
- {meerschaum-2.6.16.dist-info → meerschaum-2.7.0rc1.dist-info}/NOTICE +0 -0
- {meerschaum-2.6.16.dist-info → meerschaum-2.7.0rc1.dist-info}/WHEEL +0 -0
- {meerschaum-2.6.16.dist-info → meerschaum-2.7.0rc1.dist-info}/entry_points.txt +0 -0
- {meerschaum-2.6.16.dist-info → meerschaum-2.7.0rc1.dist-info}/top_level.txt +0 -0
- {meerschaum-2.6.16.dist-info → meerschaum-2.7.0rc1.dist-info}/zip-safe +0 -0
meerschaum/connectors/sql/_pipes.py CHANGED
@@ -460,10 +460,16 @@ def get_create_index_queries(
         else None
     )
     primary_key_constraint_name = (
-        sql_item_name(f'
+        sql_item_name(f'PK_{pipe.target}', self.flavor, None)
         if primary_key is not None
         else None
     )
+    primary_key_clustered = "CLUSTERED" if _datetime is None else "NONCLUSTERED"
+    datetime_clustered = (
+        "CLUSTERED"
+        if not existing_primary_keys and _datetime is not None
+        else "NONCLUSTERED"
+    )
 
     _id_index_name = (
         sql_item_name(index_names['id'], self.flavor, None)
@@ -474,6 +480,7 @@ def get_create_index_queries(
     _create_space_partition = get_config('system', 'experimental', 'space')
 
     ### create datetime index
+    dt_query = None
     if _datetime is not None:
         if self.flavor == 'timescaledb' and pipe.parameters.get('hypertable', True):
             _id_count = (
@@ -504,19 +511,19 @@ def get_create_index_queries(
                 + 'if_not_exists => true, '
                 + "migrate_data => true);"
             )
-        elif
-
-
-
-
-
-
-
-
-
-            )
+        elif _datetime_index_name:
+            if self.flavor == 'mssql':
+                dt_query = (
+                    f"CREATE {datetime_clustered} INDEX {_datetime_index_name} "
+                    f"ON {_pipe_name} ({_datetime_name})"
+                )
+            else:
+                dt_query = (
+                    f"CREATE INDEX {_datetime_index_name} "
+                    + f"ON {_pipe_name} ({_datetime_name})"
+                )
 
+    if dt_query:
         index_queries[_datetime] = [dt_query]
 
     primary_queries = []
@@ -623,7 +630,7 @@ def get_create_index_queries(
         ),
         (
             f"ALTER TABLE {_pipe_name}\n"
-            f"ADD CONSTRAINT {primary_key_constraint_name} PRIMARY KEY ({primary_key_name})"
+            f"ADD CONSTRAINT {primary_key_constraint_name} PRIMARY KEY {primary_key_clustered} ({primary_key_name})"
         ),
     ])
     index_queries[primary_key] = primary_queries
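Taken together with the previous hunk, the two new flags resolve MSSQL's one-clustered-index-per-table constraint: the datetime index claims the CLUSTERED slot only when no primary-key constraint already exists, and the primary key falls back to NONCLUSTERED whenever a datetime axis is configured. A minimal sketch of the decision, with illustrative stand-ins for the surrounding variables:

    # Sketch of the clustering decision; `_datetime` and `existing_primary_keys`
    # stand in for values computed earlier in get_create_index_queries().
    _datetime = 'ts'                # a datetime axis is configured
    existing_primary_keys = []      # the table has no PK constraint yet

    primary_key_clustered = "CLUSTERED" if _datetime is None else "NONCLUSTERED"
    datetime_clustered = (
        "CLUSTERED"
        if not existing_primary_keys and _datetime is not None
        else "NONCLUSTERED"
    )

    # The datetime index wins the single clustered slot.
    assert (datetime_clustered, primary_key_clustered) == ("CLUSTERED", "NONCLUSTERED")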
@@ -875,6 +882,7 @@ def get_pipe_data(
     from meerschaum.utils.dtypes import (
         attempt_cast_to_numeric,
         attempt_cast_to_uuid,
+        attempt_cast_to_bytes,
         are_dtypes_equal,
     )
     from meerschaum.utils.dtypes.sql import get_pd_type_from_db_type
@@ -891,17 +899,15 @@ def get_pipe_data(
             col: get_pd_type_from_db_type(typ)
             for col, typ in cols_types.items()
         }
-    }
+    } if pipe.enforce else {}
     if dtypes:
         if self.flavor == 'sqlite':
             if not pipe.columns.get('datetime', None):
                 _dt = pipe.guess_datetime()
                 dt = sql_item_name(_dt, self.flavor, None) if _dt else None
-                is_guess = True
             else:
                 _dt = pipe.get_columns('datetime')
                 dt = sql_item_name(_dt, self.flavor, None)
-                is_guess = False
 
             if _dt:
                 dt_type = dtypes.get(_dt, 'object').lower()
@@ -929,7 +935,7 @@ def get_pipe_data(
         col: to_pandas_dtype(typ)
         for col, typ in dtypes.items()
         if col in select_columns and col not in (omit_columns or [])
-    }
+    } if pipe.enforce else {}
     query = self.get_pipe_data_query(
         pipe,
         select_columns=select_columns,
@@ -959,6 +965,11 @@ def get_pipe_data(
         for col, typ in pipe.dtypes.items()
         if typ == 'uuid' and col in dtypes
     ]
+    bytes_columns = [
+        col
+        for col, typ in pipe.dtypes.items()
+        if typ == 'bytes' and col in dtypes
+    ]
 
     kw['coerce_float'] = kw.get('coerce_float', (len(numeric_columns) == 0))
 
@@ -978,6 +989,11 @@ def get_pipe_data(
             continue
         df[col] = df[col].apply(attempt_cast_to_uuid)
 
+    for col in bytes_columns:
+        if col not in df.columns:
+            continue
+        df[col] = df[col].apply(attempt_cast_to_bytes)
+
     if self.flavor == 'sqlite':
         ignore_dt_cols = [
             col
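The casts hooked in above undo the text representation that `bytes` values take on in the database. A rough stand-in for `attempt_cast_to_bytes`, assuming a base64 text encoding (the real helper lives in `meerschaum.utils.dtypes`):

    # Stand-in sketch for attempt_cast_to_bytes, assuming base64-encoded text.
    import base64

    def attempt_cast_to_bytes(value):
        if value is None or isinstance(value, bytes):
            return value
        try:
            return base64.b64decode(value)
        except Exception:
            return value

    stored = base64.b64encode(b'\x00\x01binary').decode('utf-8')  # as read from SQL
    assert attempt_cast_to_bytes(stored) == b'\x00\x01binary'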
@@ -1339,7 +1355,13 @@ def create_pipe_table_from_df(
     """
     Create a pipe's table from its configured dtypes and an incoming dataframe.
     """
-    from meerschaum.utils.dataframe import
+    from meerschaum.utils.dataframe import (
+        get_json_cols,
+        get_numeric_cols,
+        get_uuid_cols,
+        get_datetime_cols,
+        get_bytes_cols,
+    )
     from meerschaum.utils.sql import get_create_table_queries, sql_item_name
     primary_key = pipe.columns.get('primary', None)
     dt_col = pipe.columns.get('datetime', None)
@@ -1365,6 +1387,18 @@ def create_pipe_table_from_df(
             col: 'numeric'
             for col in get_numeric_cols(df)
         },
+        **{
+            col: 'bytes'
+            for col in get_bytes_cols(df)
+        },
+        **{
+            col: 'datetime64[ns, UTC]'
+            for col in get_datetime_cols(df, timezone_aware=True, timezone_naive=False)
+        },
+        **{
+            col: 'datetime64[ns]'
+            for col in get_datetime_cols(df, timezone_aware=False, timezone_naive=True)
+        },
         **pipe.dtypes
     }
     autoincrement = (
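Because `**pipe.dtypes` is unpacked last, dtypes configured on the pipe override anything auto-detected from the incoming frame; detection only fills the gaps. A small sketch of that precedence (column names are hypothetical):

    # Detected dtypes seed the dict; **pipe.dtypes comes last and wins.
    detected = {'payload': 'bytes', 'ts': 'datetime64[ns, UTC]'}  # from get_*_cols(df)
    configured = {'ts': 'datetime64[ns]'}                         # pipe.dtypes

    new_dtypes = {**detected, **configured}
    assert new_dtypes == {'payload': 'bytes', 'ts': 'datetime64[ns]'}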
@@ -1455,11 +1489,9 @@ def sync_pipe(
         get_update_queries,
         sql_item_name,
         update_queries,
-        get_create_table_queries,
         get_reset_autoincrement_queries,
     )
     from meerschaum.utils.misc import generate_password
-    from meerschaum.utils.dataframe import get_json_cols, get_numeric_cols, get_uuid_cols
     from meerschaum.utils.dtypes import are_dtypes_equal
     from meerschaum.utils.dtypes.sql import get_db_type_from_pd_type
     from meerschaum import Pipe
@@ -1572,6 +1604,7 @@ def sync_pipe(
         'schema': self.get_pipe_schema(pipe),
     })
 
+    dt_col = pipe.columns.get('datetime', None)
    primary_key = pipe.columns.get('primary', None)
    autoincrement = (
        pipe.parameters.get('autoincrement', False)
@@ -1589,17 +1622,23 @@ def sync_pipe(
     if not edit_success:
         return edit_success, edit_msg
 
-
+    def _check_pk(_df_to_clear):
+        if _df_to_clear is None:
+            return
+        if primary_key not in _df_to_clear.columns:
+            return
+        if not _df_to_clear[primary_key].notnull().any():
+            del _df_to_clear[primary_key]
+
+    autoincrement_needs_reset = bool(
+        autoincrement
+        and primary_key
+        and primary_key in unseen_df.columns
+        and unseen_df[primary_key].notnull().any()
+    )
     if autoincrement and primary_key:
-
-
-            del unseen_df[primary_key]
-        if update_df is not None and primary_key in update_df.columns:
-            del update_df[primary_key]
-        if delta_df is not None and primary_key in delta_df.columns:
-            del delta_df[primary_key]
-        elif unseen_df[primary_key].notnull().any():
-            autoincrement_needs_reset = True
+        for _df_to_clear in (unseen_df, update_df, delta_df):
+            _check_pk(_df_to_clear)
 
     if is_new:
         create_success, create_msg = self.create_pipe_table_from_df(
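`_check_pk` collapses three near-identical deletion branches: an autoincrementing primary-key column is now dropped from a frame only when every value in it is null, so explicitly supplied IDs survive the sync and flag the sequence for a reset instead. A self-contained sketch of the behavior:

    # Sketch of _check_pk: the PK column is dropped only when entirely null.
    import pandas as pd

    primary_key = 'id'

    def _check_pk(_df_to_clear):
        if _df_to_clear is None:
            return
        if primary_key not in _df_to_clear.columns:
            return
        if not _df_to_clear[primary_key].notnull().any():
            del _df_to_clear[primary_key]

    all_null = pd.DataFrame({'id': [None, None], 'val': [1, 2]})
    has_ids = pd.DataFrame({'id': [10, 11], 'val': [3, 4]})
    _check_pk(all_null)
    _check_pk(has_ids)
    assert 'id' not in all_null.columns   # dropped: let the database assign IDs
    assert 'id' in has_ids.columns        # kept: triggers an identity insert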
@@ -1612,6 +1651,7 @@ def sync_pipe(
 
     do_identity_insert = bool(
         self.flavor in ('mssql',)
+        and primary_key
         and primary_key in unseen_df.columns
         and autoincrement
     )
@@ -1707,7 +1747,11 @@ def sync_pipe(
         col
         for col_key, col in pipe.columns.items()
         if col and col in existing_cols
-    ]
+    ] if not primary_key else (
+        [dt_col, primary_key]
+        if self.flavor == 'timescaledb' and dt_col and dt_col in update_df.columns
+        else [primary_key]
+    )
     update_queries = get_update_queries(
         pipe.target,
         temp_target,
@@ -1716,7 +1760,8 @@ def sync_pipe(
         upsert=upsert,
         schema=self.get_pipe_schema(pipe),
         patch_schema=self.internal_schema,
-        datetime_col=
+        datetime_col=(dt_col if dt_col in update_df.columns else None),
+        identity_insert=(autoincrement and primary_key in update_df.columns),
         debug=debug,
     )
     update_success = all(
@@ -1834,7 +1879,6 @@ def sync_pipe_inplace(
         session_execute,
         update_queries,
     )
-    from meerschaum.utils.dtypes import are_dtypes_equal
     from meerschaum.utils.dtypes.sql import (
         get_pd_type_from_db_type,
     )
@@ -2054,6 +2098,7 @@ def sync_pipe_inplace(
     ) if not (upsert or static) else new_cols_types
 
     common_cols = [col for col in new_cols if col in backtrack_cols_types]
+    primary_key = pipe.columns.get('primary', None)
     on_cols = {
         col: new_cols.get(col)
         for col_key, col in pipe.columns.items()
@@ -2064,7 +2109,7 @@ def sync_pipe_inplace(
             and col in backtrack_cols_types
             and col in new_cols
         )
-    }
+    } if not primary_key else {primary_key: new_cols.get(primary_key)}
 
     null_replace_new_cols_str = (
         ', '.join([
@@ -2591,7 +2636,7 @@ def get_pipe_rowcount(
     result = self.value(query, debug=debug, silent=True)
     try:
         return int(result)
-    except Exception
+    except Exception:
         return None
 
 
@@ -2616,10 +2661,11 @@ def drop_pipe(
     from meerschaum.utils.sql import table_exists, sql_item_name, DROP_IF_EXISTS_FLAVORS
     success = True
     target = pipe.target
+    schema = self.get_pipe_schema(pipe)
     target_name = (
-        sql_item_name(target, self.flavor,
+        sql_item_name(target, self.flavor, schema)
     )
-    if table_exists(target, self, debug=debug):
+    if table_exists(target, self, schema=schema, debug=debug):
         if_exists_str = "IF EXISTS" if self.flavor in DROP_IF_EXISTS_FLAVORS else ""
         success = self.exec(
             f"DROP TABLE {if_exists_str} {target_name}", silent=True, debug=debug
meerschaum/connectors/sql/_sql.py CHANGED
@@ -790,7 +790,12 @@ def to_sql(
         truncate_item_name,
         DROP_IF_EXISTS_FLAVORS,
     )
-    from meerschaum.utils.dataframe import
+    from meerschaum.utils.dataframe import (
+        get_json_cols,
+        get_numeric_cols,
+        get_uuid_cols,
+        get_bytes_cols,
+    )
     from meerschaum.utils.dtypes import are_dtypes_equal, quantize_decimal, coerce_timezone
     from meerschaum.utils.dtypes.sql import (
         NUMERIC_PRECISION_FLAVORS,
meerschaum/connectors/valkey/_pipes.py CHANGED
@@ -46,9 +46,20 @@ def serialize_document(doc: Dict[str, Any]) -> str:
     -------
     A serialized string for the document.
     """
+    from meerschaum.utils.dtypes import serialize_bytes
     return json.dumps(
         doc,
-        default=(
+        default=(
+            lambda x: (
+                json_serialize_datetime(x)
+                if hasattr(x, 'tzinfo')
+                else (
+                    serialize_bytes(x)
+                    if isinstance(x, bytes)
+                    else str(x)
+                )
+            )
+        ),
         separators=(',', ':'),
         sort_keys=True,
     )
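The new fallback chain serializes tz-aware datetimes first, then raw `bytes`, and stringifies everything else. A runnable sketch of the same `default=` callable, with assumed stand-ins for the two helpers (ISO 8601 for `json_serialize_datetime`, base64 for `serialize_bytes`):

    # Sketch of serialize_document's new default= chain; the helper
    # implementations here are assumptions, not the library's exact code.
    import base64
    import json
    from datetime import datetime, timezone

    json_serialize_datetime = lambda dt: dt.isoformat()
    serialize_bytes = lambda b: base64.b64encode(b).decode('utf-8')

    doc = {'ts': datetime(2024, 1, 1, tzinfo=timezone.utc), 'blob': b'\x01\x02'}
    print(json.dumps(
        doc,
        default=(
            lambda x: (
                json_serialize_datetime(x)
                if hasattr(x, 'tzinfo')
                else (serialize_bytes(x) if isinstance(x, bytes) else str(x))
            )
        ),
        separators=(',', ':'),
        sort_keys=True,
    ))
    # {"blob":"AQI=","ts":"2024-01-01T00:00:00+00:00"}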
meerschaum/core/Pipe/__init__.py CHANGED
@@ -106,6 +106,7 @@ class Pipe:
         upsert,
         static,
         tzinfo,
+        enforce,
         get_columns,
         get_columns_types,
         get_columns_indices,
@@ -132,6 +133,7 @@ class Pipe:
         _persist_new_json_columns,
         _persist_new_numeric_columns,
         _persist_new_uuid_columns,
+        _persist_new_bytes_columns,
     )
     from ._verify import (
         verify,
@@ -162,12 +164,14 @@ class Pipe:
         upsert: Optional[bool] = None,
         autoincrement: Optional[bool] = None,
         static: Optional[bool] = None,
+        enforce: Optional[bool] = None,
         mrsm_instance: Optional[Union[str, InstanceConnector]] = None,
         cache: bool = False,
         debug: bool = False,
         connector_keys: Optional[str] = None,
         metric_key: Optional[str] = None,
         location_key: Optional[str] = None,
+        instance_keys: Optional[str] = None,
         indexes: Union[Dict[str, str], List[str], None] = None,
     ):
         """
@@ -219,6 +223,10 @@ class Pipe:
         static: Optional[bool], default None
             If `True`, set `static` in the parameters.
 
+        enforce: Optional[bool], default None
+            If `False`, skip data type enforcement.
+            Default behavior is `True`.
+
         temporary: bool, default False
             If `True`, prevent instance tables (pipes, users, plugins) from being created.
 
@@ -319,11 +327,13 @@ class Pipe:
         if isinstance(static, bool):
             self._attributes['parameters']['static'] = static
 
+        if isinstance(enforce, bool):
+            self._attributes['parameters']['enforce'] = enforce
+
         ### NOTE: The parameters dictionary is {} by default.
         ### A Pipe may be registered without parameters, then edited,
         ### or a Pipe may be registered with parameters set in-memory first.
-
-        _mrsm_instance = mrsm_instance if mrsm_instance is not None else instance
+        _mrsm_instance = mrsm_instance if mrsm_instance is not None else (instance or instance_keys)
         if _mrsm_instance is None:
             _mrsm_instance = get_config('meerschaum', 'instance', patch=True)
 
@@ -341,10 +351,10 @@ class Pipe:
         Return the four keys needed to reconstruct this pipe.
         """
         return {
-            '
-            '
-            '
-            '
+            'connector_keys': self.connector_keys,
+            'metric_key': self.metric_key,
+            'location_key': self.location_key,
+            'instance_keys': self.instance_keys,
         }
 
     def keys(self) -> List[str]:
@@ -385,7 +395,7 @@ class Pipe:
             warnings.simplefilter('ignore')
             try:
                 conn = parse_instance_keys(self.connector_keys)
-            except Exception
+            except Exception:
                 conn = None
             if conn:
                 self._connector = conn
@@ -429,7 +439,7 @@ class Pipe:
             _fetch_patch = {
                 'fetch': ({
                     'definition': (
-
+                        "SELECT * FROM "
                         + sql_item_name(
                             str(self.target),
                             self.instance_connector.flavor,
@@ -467,7 +477,7 @@ class Pipe:
                 and self.location_key == other.location_key
                 and self.instance_keys == other.instance_keys
             )
-        except Exception
+        except Exception:
             return False
 
     def __hash__(self):
@@ -496,11 +506,11 @@ class Pipe:
         Define the state dictionary (pickling).
         """
         return {
-            '
-            '
-            '
+            'connector_keys': self.connector_keys,
+            'metric_key': self.metric_key,
+            'location_key': self.location_key,
             'parameters': self.parameters,
-            '
+            'instance_keys': self.instance_keys,
         }
 
     def __setstate__(self, _state: Dict[str, Any]):
meerschaum/core/Pipe/_attributes.py CHANGED
@@ -289,6 +289,25 @@ def tzinfo(self) -> Union[None, timezone]:
     return None
 
 
+@property
+def enforce(self) -> bool:
+    """
+    Return the `enforce` parameter for the pipe.
+    """
+    if 'enforce' not in self.parameters:
+        self.parameters['enforce'] = True
+
+    return self.parameters['enforce']
+
+
+@enforce.setter
+def enforce(self, _enforce: bool) -> None:
+    """
+    Set the `enforce` parameter for the pipe.
+    """
+    self.parameters['_enforce'] = _enforce
+
+
 def get_columns(self, *args: str, error: bool = False) -> Union[str, Tuple[str]]:
     """
     Check if the requested columns are defined.
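Together with the new constructor flag, this property lets dtype enforcement be disabled per pipe; `get_pipe_data` above consults `pipe.enforce` before building its dtype maps. A hypothetical usage (the instance keys and pipe keys are illustrative):

    # Hypothetical usage of the new enforce flag; 'sql:local' is illustrative.
    import meerschaum as mrsm

    pipe = mrsm.Pipe(
        'demo', 'weather',
        instance='sql:local',
        enforce=False,          # stored as parameters['enforce'] by __init__
    )
    assert pipe.enforce is False   # read back through the new property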
meerschaum/core/Pipe/_dtypes.py CHANGED
meerschaum/core/Pipe/_sync.py CHANGED
@@ -368,10 +368,11 @@ def sync(
     ### Cast to a dataframe and ensure datatypes are what we expect.
     df = self.enforce_dtypes(df, chunksize=chunksize, debug=debug)
 
-    ### Capture `numeric`, `uuid`, and `
+    ### Capture `numeric`, `uuid`, `json`, and `bytes` columns.
     self._persist_new_json_columns(df, debug=debug)
     self._persist_new_numeric_columns(df, debug=debug)
     self._persist_new_uuid_columns(df, debug=debug)
+    self._persist_new_bytes_columns(df, debug=debug)
 
     if debug:
         dprint(
@@ -617,11 +618,13 @@ def filter_existing(
         filter_unseen_df,
         add_missing_cols_to_df,
         get_unhashable_cols,
-        get_numeric_cols,
     )
     from meerschaum.utils.dtypes import (
         to_pandas_dtype,
         none_if_null,
+        to_datetime,
+        are_dtypes_equal,
+        value_is_null,
     )
     from meerschaum.config import get_config
     pd = import_pandas()
@@ -669,29 +672,36 @@ def filter_existing(
     ### begin is the oldest data in the new dataframe
     begin, end = None, None
     dt_col = pipe_columns.get('datetime', None)
+    primary_key = pipe_columns.get('primary', None)
     dt_type = self.dtypes.get(dt_col, 'datetime64[ns, UTC]') if dt_col else None
+
+    if autoincrement and primary_key == dt_col and dt_col not in df.columns:
+        if enforce_dtypes:
+            df = self.enforce_dtypes(df, chunksize=chunksize, debug=debug)
+        return df, get_empty_df(), df
+
     try:
-        min_dt_val = df[dt_col].min(skipna=True) if dt_col else None
+        min_dt_val = df[dt_col].min(skipna=True) if dt_col and dt_col in df.columns else None
         if is_dask and min_dt_val is not None:
             min_dt_val = min_dt_val.compute()
         min_dt = (
-
-            if min_dt_val is not None and 'datetime'
+            to_datetime(min_dt_val, as_pydatetime=True)
+            if min_dt_val is not None and are_dtypes_equal(dt_type, 'datetime')
             else min_dt_val
         )
     except Exception:
         min_dt = None
-
-
+
+    if not are_dtypes_equal('datetime', str(type(min_dt))) or value_is_null(min_dt):
+        if not are_dtypes_equal('int', str(type(min_dt))):
             min_dt = None
 
     if isinstance(min_dt, datetime):
-
-
-
-
-
-        )
+        rounded_min_dt = round_time(min_dt, to='down')
+        try:
+            begin = rounded_min_dt - timedelta(minutes=1)
+        except OverflowError:
+            begin = rounded_min_dt
     elif dt_type and 'int' in dt_type.lower():
         begin = min_dt
     elif dt_col is None:
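The `OverflowError` guard matters for pathological minimums near `datetime.min`, where backing the bound off by a minute would underflow. A minimal sketch, with a simplified stand-in for `round_time`:

    # Sketch of the new lower-bound guard; round_time is a simplified stand-in
    # for the helper imported from meerschaum.utils.misc.
    from datetime import datetime, timedelta

    def round_time(dt, to='down'):
        return dt.replace(second=0, microsecond=0)

    min_dt = datetime.min                         # pathological incoming minimum
    rounded_min_dt = round_time(min_dt, to='down')
    try:
        begin = rounded_min_dt - timedelta(minutes=1)
    except OverflowError:
        begin = rounded_min_dt
    assert begin == datetime.min                  # underflow falls back gracefully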
@@ -699,11 +709,11 @@ def filter_existing(
 
     ### end is the newest data in the new dataframe
     try:
-        max_dt_val = df[dt_col].max(skipna=True) if dt_col else None
+        max_dt_val = df[dt_col].max(skipna=True) if dt_col and dt_col in df.columns else None
         if is_dask and max_dt_val is not None:
             max_dt_val = max_dt_val.compute()
         max_dt = (
-
+            to_datetime(max_dt_val, as_pydatetime=True)
             if max_dt_val is not None and 'datetime' in str(dt_type)
             else max_dt_val
         )
@@ -712,8 +722,8 @@ def filter_existing(
         traceback.print_exc()
         max_dt = None
 
-    if ('datetime'
-        if 'int'
+    if not are_dtypes_equal('datetime', str(type(max_dt))) or value_is_null(max_dt):
+        if not are_dtypes_equal('int', str(type(max_dt))):
             max_dt = None
 
     if isinstance(max_dt, datetime):
@@ -723,7 +733,7 @@ def filter_existing(
                 to='down'
             ) + timedelta(minutes=1)
         )
-    elif dt_type and 'int' in dt_type.lower():
+    elif dt_type and 'int' in dt_type.lower() and max_dt is not None:
         end = max_dt + 1
 
     if max_dt is not None and min_dt is not None and min_dt > max_dt:
@@ -738,7 +748,7 @@ def filter_existing(
 
     unique_index_vals = {
         col: df[col].unique()
-        for col in pipe_columns
+        for col in (pipe_columns if not primary_key else [primary_key])
         if col in df.columns and col != dt_col
     } if not date_bound_only else {}
     filter_params_index_limit = get_config('pipes', 'sync', 'filter_params_index_limit')
@@ -777,14 +787,15 @@ def filter_existing(
 
     ### Separate new rows from changed ones.
     on_cols = [
-        col
+        col
+        for col_key, col in pipe_columns.items()
         if (
             col
            and
            col_key != 'value'
            and col in backtrack_df.columns
        )
-    ]
+    ] if not primary_key else [primary_key]
     self_dtypes = self.dtypes
     on_cols_dtypes = {
         col: to_pandas_dtype(typ)
@@ -1020,3 +1031,32 @@ def _persist_new_json_columns(self, df, debug: bool = False) -> SuccessTuple:
         return edit_success, edit_msg
 
     return True, "Success"
+
+
+def _persist_new_bytes_columns(self, df, debug: bool = False) -> SuccessTuple:
+    """
+    Check for new `bytes` columns and update the parameters.
+    """
+    from meerschaum.utils.dataframe import get_bytes_cols
+    bytes_cols = get_bytes_cols(df)
+    existing_bytes_cols = [col for col, typ in self.dtypes.items() if typ == 'bytes']
+    new_bytes_cols = [col for col in bytes_cols if col not in existing_bytes_cols]
+    if not new_bytes_cols:
+        return True, "Success"
+
+    self._attributes_sync_time = None
+    dt_col = self.columns.get('datetime', None)
+    dtypes = self.parameters.get('dtypes', {})
+    if dt_col not in dtypes:
+        dtypes[dt_col] = 'datetime'
+    dtypes.update({col: 'bytes' for col in bytes_cols})
+    self.parameters['dtypes'] = dtypes
+
+    if not self.temporary:
+        edit_success, edit_msg = self.edit(interactive=False, debug=debug)
+        if not edit_success:
+            warn(f"Unable to update bytes dtypes for {self}:\n{edit_msg}")
+
+        return edit_success, edit_msg
+
+    return True, "Success"
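End to end, syncing a frame that contains raw `bytes` should now record the dtype on the pipe automatically. A hypothetical usage sketch (the instance keys and pipe keys are illustrative, and `temporary=True` keeps instance tables untouched):

    # Hypothetical end-to-end usage of the new bytes support.
    import pandas as pd
    import meerschaum as mrsm

    pipe = mrsm.Pipe('demo', 'blobs', instance='sql:local', temporary=True)
    df = pd.DataFrame({'id': [1], 'payload': [b'\x00\xff']})
    pipe.sync(df)

    # _persist_new_bytes_columns registers the column during the sync.
    assert pipe.dtypes.get('payload') == 'bytes'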