meerschaum 2.4.5__py3-none-any.whl → 2.4.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- meerschaum/_internal/docs/index.py +1 -0
- meerschaum/actions/show.py +2 -1
- meerschaum/actions/sql.py +11 -11
- meerschaum/api/dash/pipes.py +4 -2
- meerschaum/api/routes/_pipes.py +3 -8
- meerschaum/config/_version.py +1 -1
- meerschaum/connectors/api/_pipes.py +4 -4
- meerschaum/connectors/sql/_SQLConnector.py +12 -2
- meerschaum/connectors/sql/_create_engine.py +13 -6
- meerschaum/connectors/sql/_pipes.py +81 -65
- meerschaum/connectors/sql/_sql.py +194 -106
- meerschaum/connectors/valkey/_ValkeyConnector.py +2 -5
- meerschaum/core/Pipe/__init__.py +1 -0
- meerschaum/core/Pipe/_attributes.py +1 -1
- meerschaum/core/Pipe/_data.py +16 -16
- meerschaum/core/Pipe/_deduplicate.py +27 -27
- meerschaum/core/Pipe/_sync.py +26 -1
- meerschaum/core/Pipe/_verify.py +5 -5
- meerschaum/utils/dataframe.py +127 -8
- meerschaum/utils/dtypes/__init__.py +26 -4
- meerschaum/utils/dtypes/sql.py +30 -0
- meerschaum/utils/misc.py +1 -1
- meerschaum/utils/sql.py +100 -64
- meerschaum/utils/yaml.py +3 -6
- {meerschaum-2.4.5.dist-info → meerschaum-2.4.7.dist-info}/METADATA +1 -1
- {meerschaum-2.4.5.dist-info → meerschaum-2.4.7.dist-info}/RECORD +32 -32
- {meerschaum-2.4.5.dist-info → meerschaum-2.4.7.dist-info}/LICENSE +0 -0
- {meerschaum-2.4.5.dist-info → meerschaum-2.4.7.dist-info}/NOTICE +0 -0
- {meerschaum-2.4.5.dist-info → meerschaum-2.4.7.dist-info}/WHEEL +0 -0
- {meerschaum-2.4.5.dist-info → meerschaum-2.4.7.dist-info}/entry_points.txt +0 -0
- {meerschaum-2.4.5.dist-info → meerschaum-2.4.7.dist-info}/top_level.txt +0 -0
- {meerschaum-2.4.5.dist-info → meerschaum-2.4.7.dist-info}/zip-safe +0 -0
meerschaum/connectors/sql/_sql.py
CHANGED
@@ -20,32 +20,33 @@ _bulk_flavors = {'postgresql', 'timescaledb', 'citus'}
 _disallow_chunks_flavors = {'duckdb', 'mssql'}
 _max_chunks_flavors = {'sqlite': 1000,}
 
+
 def read(
-        self,
-        query_or_table: Union[str, sqlalchemy.Query],
-        params: Union[Dict[str, Any], List[str], None] = None,
-        dtype: Optional[Dict[str, Any]] = None,
-        coerce_float: bool = True,
-        chunksize: Optional[int] = -1,
-        workers: Optional[int] = None,
-        chunk_hook: Optional[Callable[[pandas.DataFrame], Any]] = None,
-        as_hook_results: bool = False,
-        chunks: Optional[int] = None,
-        schema: Optional[str] = None,
-        as_chunks: bool = False,
-        as_iterator: bool = False,
-        as_dask: bool = False,
-        index_col: Optional[str] = None,
-        silent: bool = False,
-        debug: bool = False,
-        **kw: Any
-    ) -> Union[
-        pandas.DataFrame,
-        dask.DataFrame,
-        List[pandas.DataFrame],
-        List[Any],
-        None,
-    ]:
+    self,
+    query_or_table: Union[str, sqlalchemy.Query],
+    params: Union[Dict[str, Any], List[str], None] = None,
+    dtype: Optional[Dict[str, Any]] = None,
+    coerce_float: bool = True,
+    chunksize: Optional[int] = -1,
+    workers: Optional[int] = None,
+    chunk_hook: Optional[Callable[[pandas.DataFrame], Any]] = None,
+    as_hook_results: bool = False,
+    chunks: Optional[int] = None,
+    schema: Optional[str] = None,
+    as_chunks: bool = False,
+    as_iterator: bool = False,
+    as_dask: bool = False,
+    index_col: Optional[str] = None,
+    silent: bool = False,
+    debug: bool = False,
+    **kw: Any
+) -> Union[
+    pandas.DataFrame,
+    dask.DataFrame,
+    List[pandas.DataFrame],
+    List[Any],
+    None,
+]:
     """
     Read a SQL query or table into a pandas dataframe.
 
@@ -145,7 +146,7 @@ def read(
     if chunksize is None and as_iterator:
         if not silent and self.flavor not in _disallow_chunks_flavors:
             warn(
-                f"An iterator may only be generated if chunksize is not None.\n"
+                "An iterator may only be generated if chunksize is not None.\n"
                 + "Falling back to a chunksize of 1000.", stacklevel=3,
             )
         chunksize = 1000
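The fallback above matters for callers that request an iterator without a chunk size. A minimal usage sketch, assuming meerschaum's built-in `sql:local` SQLite connector (the table name is illustrative):

```python
# Hypothetical usage sketch: stream a large table in chunks.
import meerschaum as mrsm

conn = mrsm.get_connector('sql:local')

# as_iterator requires a chunksize; if chunksize is None, the
# connector warns and falls back to a chunksize of 1000.
chunks = conn.read('my_table', chunksize=5000, as_iterator=True)
for chunk_df in chunks:
    print(len(chunk_df))
```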
@@ -386,12 +387,12 @@ def read(
 
 
 def value(
-        self,
-        query: str,
-        *args: Any,
-        use_pandas: bool = False,
-        **kw: Any
-    ) -> Any:
+    self,
+    query: str,
+    *args: Any,
+    use_pandas: bool = False,
+    **kw: Any
+) -> Any:
     """
     Execute the provided query and return the first value.
 
@@ -424,18 +425,22 @@ def value(
     if use_pandas:
         try:
             return self.read(query, *args, **kw).iloc[0, 0]
-        except Exception as e:
+        except Exception:
             return None
 
     _close = kw.get('close', True)
     _commit = kw.get('commit', (self.flavor != 'mssql'))
+
+    # _close = True
+    # _commit = True
+
     try:
         result, connection = self.exec(
             query,
             *args,
-            with_connection = True,
-            close = False,
-            commit = _commit,
+            with_connection=True,
+            close=False,
+            commit=_commit,
             **kw
         )
         first = result.first() if result is not None else None
@@ -452,10 +457,10 @@ def value(
 
 
 def execute(
-        self,
-        *args : Any,
-        **kw : Any
-    ) -> Optional[sqlalchemy.engine.result.resultProxy]:
+    self,
+    *args : Any,
+    **kw : Any
+) -> Optional[sqlalchemy.engine.result.resultProxy]:
     """
     An alias for `meerschaum.connectors.sql.SQLConnector.exec`.
     """
@@ -463,22 +468,22 @@ def execute(
 
 
 def exec(
-        self,
-        query: str,
-        *args: Any,
-        silent: bool = False,
-        debug: bool = False,
-        commit: Optional[bool] = None,
-        close: Optional[bool] = None,
-        with_connection: bool = False,
-        **kw: Any
-    ) -> Union[
-        sqlalchemy.engine.result.resultProxy,
-        sqlalchemy.engine.cursor.LegacyCursorResult,
-        Tuple[sqlalchemy.engine.result.resultProxy, sqlalchemy.engine.base.Connection],
-        Tuple[sqlalchemy.engine.cursor.LegacyCursorResult, sqlalchemy.engine.base.Connection],
-        None
-    ]:
+    self,
+    query: str,
+    *args: Any,
+    silent: bool = False,
+    debug: bool = False,
+    commit: Optional[bool] = None,
+    close: Optional[bool] = None,
+    with_connection: bool = False,
+    **kw: Any
+) -> Union[
+    sqlalchemy.engine.result.resultProxy,
+    sqlalchemy.engine.cursor.LegacyCursorResult,
+    Tuple[sqlalchemy.engine.result.resultProxy, sqlalchemy.engine.base.Connection],
+    Tuple[sqlalchemy.engine.cursor.LegacyCursorResult, sqlalchemy.engine.base.Connection],
+    None
+]:
     """
     Execute SQL code and return the `sqlalchemy` result, e.g. when calling stored procedures.
 
@@ -492,7 +497,7 @@ def exec(
 
     args: Any
         Arguments passed to `sqlalchemy.engine.execute`.
-
+
     silent: bool, default False
         If `True`, suppress warnings.
 
@@ -509,7 +514,7 @@ def exec(
     with_connection: bool, default False
        If `True`, return a tuple including the connection object.
        This does not apply if `query` is a list of strings.
-
+
    Returns
    -------
    The `sqlalchemy` result object, or a tuple with the connection if `with_connection` is provided.
@@ -519,8 +524,8 @@ def exec(
         return self.exec_queries(
             list(query),
             *args,
-            silent = silent,
-            debug = debug,
+            silent=silent,
+            debug=debug,
             **kw
         )
 
@@ -538,8 +543,19 @@ def exec(
     if not hasattr(query, 'compile'):
         query = sqlalchemy.text(query)
 
-    connection = self.engine.connect()
-    transaction = connection.begin() if _commit else None
+    connection = self.get_connection()
+
+    try:
+        transaction = connection.begin() if _commit else None
+    except sqlalchemy.exc.InvalidRequestError:
+        connection = self.get_connection(rebuild=True)
+        transaction = connection.begin()
+
+    if transaction is not None and not transaction.is_active:
+        connection = self.get_connection(rebuild=True)
+        transaction = connection.begin() if _commit else None
+
+    result = None
     try:
         result = connection.execute(query, *args, **kw)
         if _commit:
@@ -552,29 +568,30 @@ def exec(
         result = None
         if _commit:
             transaction.rollback()
+            connection = self.get_connection(rebuild=True)
     finally:
         if _close:
            connection.close()
 
-        if with_connection:
-            return result, connection
+    if with_connection:
+        return result, connection
 
     return result
 
 
 def exec_queries(
-        self,
-        queries: List[
-            Union[
-                str,
-                Tuple[str, Callable[['sqlalchemy.orm.session.Session'], List[str]]]
-            ]
-        ],
-        break_on_error: bool = False,
-        rollback: bool = True,
-        silent: bool = False,
-        debug: bool = False,
-    ) -> List[sqlalchemy.engine.cursor.LegacyCursorResult]:
+    self,
+    queries: List[
+        Union[
+            str,
+            Tuple[str, Callable[['sqlalchemy.orm.session.Session'], List[str]]]
+        ]
+    ],
+    break_on_error: bool = False,
+    rollback: bool = True,
+    silent: bool = False,
+    debug: bool = False,
+) -> List[sqlalchemy.engine.cursor.LegacyCursorResult]:
     """
     Execute a list of queries in a single transaction.
 
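The rework above reuses cached connections, so `exec` must now tolerate a connection whose transaction state is stale. A standalone sketch of that retry pattern, assuming SQLAlchemy (the helper name is illustrative):

```python
import sqlalchemy

def begin_with_retry(get_connection):
    """
    `get_connection(rebuild=...)` is assumed to return a cached or
    fresh connection, like the new SQLConnector.get_connection.
    """
    connection = get_connection()
    try:
        # A cached connection may already be inside a transaction.
        transaction = connection.begin()
    except sqlalchemy.exc.InvalidRequestError:
        connection = get_connection(rebuild=True)
        transaction = connection.begin()

    # The transaction may also come back inactive (e.g. after a rollback).
    if not transaction.is_active:
        connection = get_connection(rebuild=True)
        transaction = connection.begin()

    return connection, transaction
```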
@@ -624,7 +641,7 @@ def exec_queries(
 
         if debug:
             dprint(f"[{self}]\n" + str(query))
-
+
         try:
             result = session.execute(query)
             session.flush()
@@ -645,9 +662,9 @@ def exec_queries(
                 hook_results = self.exec_queries(
                     hook_queries,
                     break_on_error = break_on_error,
-                    rollback = rollback,
-                    silent = silent,
-                    debug = debug,
+                    rollback=rollback,
+                    silent=silent,
+                    debug=debug,
                 )
                 result = (result, hook_results)
 
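As the signature above shows, `exec_queries` accepts either plain strings or `(query, hook)` tuples, where the hook receives the active session and returns follow-up queries for the same transaction. A hypothetical sketch (the statements and the `conn` connector are illustrative):

```python
# Hypothetical sketch of the (query, hook) form accepted by exec_queries.
def add_index_hook(session) -> list:
    # The hook may inspect the open session before returning
    # follow-up statements to run in the same transaction.
    return ["CREATE INDEX IF NOT EXISTS idx_demo_id ON demo (id)"]

queries = [
    "CREATE TABLE IF NOT EXISTS demo (id INTEGER)",
    ("INSERT INTO demo (id) VALUES (1)", add_index_hook),
]
# results = conn.exec_queries(queries, break_on_error=True)
```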
@@ -657,20 +674,20 @@ def exec_queries(
 
 
 def to_sql(
-        self,
-        df: pandas.DataFrame,
-        name: str = None,
-        index: bool = False,
-        if_exists: str = 'replace',
-        method: str = "",
-        chunksize: Optional[int] = -1,
-        schema: Optional[str] = None,
-        silent: bool = False,
-        debug: bool = False,
-        as_tuple: bool = False,
-        as_dict: bool = False,
-        **kw
-    ) -> Union[bool, SuccessTuple]:
+    self,
+    df: pandas.DataFrame,
+    name: str = None,
+    index: bool = False,
+    if_exists: str = 'replace',
+    method: str = "",
+    chunksize: Optional[int] = -1,
+    schema: Optional[str] = None,
+    silent: bool = False,
+    debug: bool = False,
+    as_tuple: bool = False,
+    as_dict: bool = False,
+    **kw
+) -> Union[bool, SuccessTuple]:
     """
     Upload a DataFrame's contents to the SQL server.
 
@@ -708,7 +725,7 @@ def to_sql(
         If `True`, return a dictionary of transaction information.
         The keys are `success`, `msg`, `start`, `end`, `duration`, `num_rows`, `chunksize`,
        `method`, and `target`.
-
+
     kw: Any
         Additional arguments will be passed to the DataFrame's `to_sql` function
 
@@ -737,9 +754,12 @@ def to_sql(
         json_flavors,
         truncate_item_name,
     )
-    from meerschaum.utils.dataframe import get_json_cols, get_numeric_cols
+    from meerschaum.utils.dataframe import get_json_cols, get_numeric_cols, get_uuid_cols
     from meerschaum.utils.dtypes import are_dtypes_equal, quantize_decimal
-    from meerschaum.utils.dtypes.sql import NUMERIC_PRECISION_FLAVORS
+    from meerschaum.utils.dtypes.sql import (
+        NUMERIC_PRECISION_FLAVORS,
+        PD_TO_SQLALCHEMY_DTYPES_FLAVORS,
+    )
     from meerschaum.connectors.sql._create_engine import flavor_configs
     from meerschaum.utils.packages import attempt_import, import_pandas
     sqlalchemy = attempt_import('sqlalchemy', debug=debug)
@@ -859,6 +879,11 @@ def to_sql(
         )
     )
 
+    if PD_TO_SQLALCHEMY_DTYPES_FLAVORS['uuid'].get(self.flavor, None) != 'Uuid':
+        uuid_cols = get_uuid_cols(df)
+        for col in uuid_cols:
+            df[col] = df[col].astype(str)
+
     try:
         with warnings.catch_warnings():
             warnings.filterwarnings('ignore', 'case sensitivity issues')
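The new block above only downgrades `uuid` columns to strings when the flavor's dtype map lacks a native `Uuid` type. A standalone sketch of that fallback with plain pandas (the detection logic here is an illustrative stand-in, not meerschaum's `get_uuid_cols`):

```python
import uuid
import pandas as pd

df = pd.DataFrame({'id': [uuid.uuid4(), uuid.uuid4()], 'val': [1, 2]})

# Detect columns holding uuid.UUID objects, then serialize to strings
# for databases without a native UUID column type.
uuid_cols = [
    col for col in df.columns
    if df[col].map(lambda val: isinstance(val, uuid.UUID)).all()
]
for col in uuid_cols:
    df[col] = df[col].astype(str)

print(df.dtypes)  # 'id' is now an object column of strings
```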
@@ -890,12 +915,12 @@ def to_sql(
 
 
 def psql_insert_copy(
-        table: pandas.io.sql.SQLTable,
-        conn: Union[sqlalchemy.engine.Engine, sqlalchemy.engine.Connection],
-        keys: List[str],
-        data_iter: Iterable[Any],
-        schema: Optional[str] = None,
-    ) -> None:
+    table: pandas.io.sql.SQLTable,
+    conn: Union[sqlalchemy.engine.Engine, sqlalchemy.engine.Connection],
+    keys: List[str],
+    data_iter: Iterable[Any],
+    schema: Optional[str] = None,
+) -> None:
     """
     Execute SQL statement inserting data for PostgreSQL.
 
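`psql_insert_copy` follows the callable-`method` protocol documented for pandas `to_sql`. A minimal COPY-based callable in the same style, assuming a psycopg2-backed engine (adapted from the pandas docs, not meerschaum's implementation):

```python
import csv
from io import StringIO

def copy_insert(table, conn, keys, data_iter, schema=None):
    """COPY-based insert callable for pandas `to_sql(method=...)`."""
    dbapi_conn = conn.connection  # raw psycopg2 connection
    with dbapi_conn.cursor() as cur:
        buf = StringIO()
        csv.writer(buf).writerows(data_iter)
        buf.seek(0)
        columns = ', '.join(f'"{key}"' for key in keys)
        table_name = (
            f'"{table.schema}"."{table.name}"' if table.schema
            else f'"{table.name}"'
        )
        cur.copy_expert(f'COPY {table_name} ({columns}) FROM STDIN WITH CSV', buf)

# Usage (illustrative):
# df.to_sql('demo', engine, method=copy_insert, index=False)
```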
@@ -981,8 +1006,71 @@ def format_sql_query_for_dask(query: str) -> 'sqlalchemy.sql.selectable.Select':
     sqlalchemy_sql = attempt_import("sqlalchemy.sql")
     select, text = sqlalchemy_sql.select, sqlalchemy_sql.text
 
-    parts = query.rsplit('ORDER BY', maxsplit=1)
     meta_query = f"SELECT * FROM (\n{query}\n) AS s"
-    # if parts[1]:
-    #     meta_query += "\nORDER BY " + parts[1]
     return select(text(_remove_leading_select(meta_query)))
+
+
+def get_connection(self, rebuild: bool = False) -> 'sqlalchemy.engine.base.Connection':
+    """
+    Return the current alive connection.
+
+    Parameters
+    ----------
+    rebuild: bool, default False
+        If `True`, close the previous connection and open a new one.
+
+    Returns
+    -------
+    A `sqlalchemy.engine.base.Connection` object.
+    """
+    import threading
+    if '_thread_connections' not in self.__dict__:
+        self.__dict__['_thread_connections'] = {}
+
+    self._cleanup_connections()
+
+    thread_id = threading.get_ident()
+
+    thread_connections = self.__dict__.get('_thread_connections', {})
+    connection = thread_connections.get(thread_id, None)
+
+    if rebuild and connection is not None:
+        try:
+            connection.close()
+        except Exception:
+            pass
+
+        _ = thread_connections.pop(thread_id, None)
+        connection = None
+
+    if connection is None or connection.closed:
+        connection = self.engine.connect()
+        thread_connections[thread_id] = connection
+
+    return connection
+
+
+def _cleanup_connections(self) -> None:
+    """
+    Remove connections for inactive threads.
+    """
+    import threading
+    thread_connections = self.__dict__.get('_thread_connections', None)
+    if not thread_connections:
+        return
+    thread_ids = set(thread_connections)
+    active_threads = [
+        thread
+        for thread in threading.enumerate()
+        if thread.ident in thread_ids
+    ]
+    active_thread_ids = {thread.ident for thread in active_threads}
+    inactive_thread_ids = thread_ids - active_thread_ids
+    for thread_id in inactive_thread_ids:
+        connection = thread_connections.pop(thread_id, None)
+        if connection is None:
+            continue
+        try:
+            connection.close()
+        except Exception:
+            pass
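The new `get_connection` keeps one connection per thread, keyed by `threading.get_ident()`, and prunes entries for finished threads before each lookup. The same caching scheme in isolation (a sketch, not the meerschaum API):

```python
import threading

class PerThreadConnections:
    """Cache one connection per thread; prune entries for dead threads."""

    def __init__(self, factory):
        self._factory = factory  # e.g. engine.connect (illustrative)
        self._connections = {}

    def _cleanup(self):
        alive = {t.ident for t in threading.enumerate()}
        for ident in set(self._connections) - alive:
            conn = self._connections.pop(ident, None)
            if conn is not None:
                conn.close()

    def get(self, rebuild=False):
        self._cleanup()
        ident = threading.get_ident()
        if rebuild:
            old = self._connections.pop(ident, None)
            if old is not None:
                old.close()
        if ident not in self._connections:
            self._connections[ident] = self._factory()
        return self._connections[ident]
```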
meerschaum/connectors/valkey/_ValkeyConnector.py
CHANGED
@@ -202,11 +202,8 @@ class ValkeyConnector(Connector):
         -------
         The current index counter value (how many docs have been pushed).
         """
-        docs_str = df.to_json(
-            date_format='iso',
-            orient='records',
-            date_unit='us',
-        )
+        from meerschaum.utils.dataframe import to_json
+        docs_str = to_json(df)
         docs = json.loads(docs_str)
         return self.push_docs(
             docs,
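The connector now delegates serialization to the shared `meerschaum.utils.dataframe.to_json` helper. The round trip can be approximated with plain pandas; the sketch below shows why `date_unit='us'` matters (a sketch of the presumed behavior, not the helper itself):

```python
import json
import pandas as pd

df = pd.DataFrame({
    'dt': [pd.Timestamp('2024-01-01 00:00:00.000001')],
    'val': [1.5],
})

# date_unit='us' keeps microsecond precision; orient='records' yields
# one JSON object per row, matching the documents pushed to the server.
docs_str = df.to_json(date_format='iso', orient='records', date_unit='us')
docs = json.loads(docs_str)
print(docs)  # [{'dt': '2024-01-01T00:00:00.000001', 'val': 1.5}]
```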
meerschaum/core/Pipe/_attributes.py
CHANGED
@@ -298,7 +298,7 @@ def get_val_column(self, debug: bool = False) -> Union[str, None]:
                 break
     if not candidates:
         if debug:
-            dprint(f"No value column could be determined.")
+            dprint("No value column could be determined.")
         return None
 
     return candidates[0]
meerschaum/core/Pipe/_data.py
CHANGED
@@ -574,10 +574,10 @@ def get_rowcount(
 
 
 def get_chunk_interval(
-        self,
-        chunk_interval: Union[timedelta, int, None] = None,
-        debug: bool = False,
-    ) -> Union[timedelta, int]:
+    self,
+    chunk_interval: Union[timedelta, int, None] = None,
+    debug: bool = False,
+) -> Union[timedelta, int]:
     """
     Get the chunk interval to use for this pipe.
 
@@ -615,18 +615,18 @@ def get_chunk_interval(
 
 
 def get_chunk_bounds(
-        self,
-        begin: Union[datetime, int, None] = None,
-        end: Union[datetime, int, None] = None,
-        bounded: bool = False,
-        chunk_interval: Union[timedelta, int, None] = None,
-        debug: bool = False,
-    ) -> List[
-        Tuple[
-            Union[datetime, int, None],
-            Union[datetime, int, None],
-        ]
-    ]:
+    self,
+    begin: Union[datetime, int, None] = None,
+    end: Union[datetime, int, None] = None,
+    bounded: bool = False,
+    chunk_interval: Union[timedelta, int, None] = None,
+    debug: bool = False,
+) -> List[
+    Tuple[
+        Union[datetime, int, None],
+        Union[datetime, int, None],
+    ]
+]:
     """
     Return a list of datetime bounds for iterating over the pipe's `datetime` axis.
 
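A simplified re-implementation of the bounded case described by `get_chunk_bounds`, to make the tuple layout concrete (illustrative only; the real method also supports open-ended bounds and integer axes):

```python
from datetime import datetime, timedelta

def chunk_bounds(begin, end, interval):
    # Split [begin, end) into consecutive (chunk_begin, chunk_end) tuples.
    bounds = []
    chunk_begin = begin
    while chunk_begin < end:
        chunk_end = min(chunk_begin + interval, end)
        bounds.append((chunk_begin, chunk_end))
        chunk_begin = chunk_end
    return bounds

print(chunk_bounds(datetime(2024, 1, 1), datetime(2024, 1, 4), timedelta(days=1)))
# Three (begin, end) tuples, one per day.
```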
meerschaum/core/Pipe/_deduplicate.py
CHANGED
@@ -12,17 +12,17 @@ from meerschaum.utils.typing import SuccessTuple, Any, Optional, Dict, Tuple, Union
 
 
 def deduplicate(
-        self,
-        begin: Union[datetime, int, None] = None,
-        end: Union[datetime, int, None] = None,
-        params: Optional[Dict[str, Any]] = None,
-        chunk_interval: Union[datetime, int, None] = None,
-        bounded: Optional[bool] = None,
-        workers: Optional[int] = None,
-        debug: bool = False,
-        _use_instance_method: bool = True,
-        **kwargs: Any
-    ) -> SuccessTuple:
+    self,
+    begin: Union[datetime, int, None] = None,
+    end: Union[datetime, int, None] = None,
+    params: Optional[Dict[str, Any]] = None,
+    chunk_interval: Union[datetime, int, None] = None,
+    bounded: Optional[bool] = None,
+    workers: Optional[int] = None,
+    debug: bool = False,
+    _use_instance_method: bool = True,
+    **kwargs: Any
+) -> SuccessTuple:
     """
     Call the Pipe's instance connector's `delete_duplicates` method to delete duplicate rows.
 
@@ -158,10 +158,10 @@ def deduplicate(
         chunk_msg_body = ""
 
         full_chunk = self.get_data(
-            begin = chunk_begin,
-            end = chunk_end,
-            params = params,
-            debug = debug,
+            begin=chunk_begin,
+            end=chunk_end,
+            params=params,
+            debug=debug,
         )
         if full_chunk is None or len(full_chunk) == 0:
             return bounds, (True, f"{chunk_msg_header}\nChunk is empty, skipping...")
@@ -171,10 +171,10 @@ def deduplicate(
             return bounds, (False, f"None of {items_str(indices)} were present in chunk.")
         try:
             full_chunk = full_chunk.drop_duplicates(
-                subset = chunk_indices,
-                keep = 'last'
+                subset=chunk_indices,
+                keep='last'
             ).reset_index(
-                drop = True,
+                drop=True,
             )
         except Exception as e:
             return (
@@ -183,10 +183,10 @@ def deduplicate(
             )
 
         clear_success, clear_msg = self.clear(
-            begin = chunk_begin,
-            end = chunk_end,
-            params = params,
-            debug = debug,
+            begin=chunk_begin,
+            end=chunk_end,
+            params=params,
+            debug=debug,
         )
         if not clear_success:
             chunk_msg_body += f"Failed to clear chunk while deduplicating:\n{clear_msg}\n"
@@ -195,13 +195,13 @@ def deduplicate(
         sync_success, sync_msg = self.sync(full_chunk, debug=debug)
         if not sync_success:
             chunk_msg_body += f"Failed to sync chunk while deduplicating:\n{sync_msg}\n"
-
+
         ### Finally check if the deduplication worked.
         chunk_rowcount = self.get_rowcount(
-            begin = chunk_begin,
-            end = chunk_end,
-            params = params,
-            debug = debug,
+            begin=chunk_begin,
+            end=chunk_end,
+            params=params,
+            debug=debug,
        )
         if chunk_rowcount != deduped_chunk_len:
             return bounds, (
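The chunk workflow above (fetch, drop duplicates keeping the last row, clear, resync, verify the rowcount) reduces to a small pandas core; a sketch with made-up data:

```python
import pandas as pd

chunk = pd.DataFrame({
    'datetime': pd.to_datetime(['2024-01-01', '2024-01-01', '2024-01-02']),
    'id': [1, 1, 2],
    'val': [10, 11, 20],
})
indices = ['datetime', 'id']

# Keep the last row per index set, as deduplicate() does per chunk.
deduped = chunk.drop_duplicates(subset=indices, keep='last').reset_index(drop=True)

# The rowcount check: one row should remain per unique index tuple.
assert len(deduped) == chunk.groupby(indices).ngroups
print(deduped)  # rows (2024-01-01, 1, 11) and (2024-01-02, 2, 20)
```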
meerschaum/core/Pipe/_sync.py
CHANGED
@@ -367,9 +367,10 @@ def sync(
         ### Cast to a dataframe and ensure datatypes are what we expect.
         df = self.enforce_dtypes(df, chunksize=chunksize, debug=debug)
 
-        ### Capture `numeric` and `json` columns.
+        ### Capture `numeric`, `uuid`, and `json` columns.
         self._persist_new_json_columns(df, debug=debug)
         self._persist_new_numeric_columns(df, debug=debug)
+        self._persist_new_uuid_columns(df, debug=debug)
 
         if debug:
             dprint(
@@ -928,6 +929,30 @@ def _persist_new_numeric_columns(self, df, debug: bool = False) -> SuccessTuple:
     return True, "Success"
 
 
+def _persist_new_uuid_columns(self, df, debug: bool = False) -> SuccessTuple:
+    """
+    Check for new UUID columns and update the parameters.
+    """
+    from meerschaum.utils.dataframe import get_uuid_cols
+    uuid_cols = get_uuid_cols(df)
+    existing_uuid_cols = [col for col, typ in self.dtypes.items() if typ == 'uuid']
+    new_uuid_cols = [col for col in uuid_cols if col not in existing_uuid_cols]
+    if not new_uuid_cols:
+        return True, "Success"
+
+    dtypes = self.parameters.get('dtypes', {})
+    dtypes.update({col: 'uuid' for col in uuid_cols})
+    self.parameters['dtypes'] = dtypes
+    if not self.temporary:
+        edit_success, edit_msg = self.edit(interactive=False, debug=debug)
+        if not edit_success:
+            warn(f"Unable to update UUID dtypes for {self}:\n{edit_msg}")
+
+        return edit_success, edit_msg
+
+    return True, "Success"
+
+
 def _persist_new_json_columns(self, df, debug: bool = False) -> SuccessTuple:
     """
     Check for new JSON columns and update the parameters.
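Once a new `uuid` column is detected, it is recorded under the pipe's `dtypes` parameters so later syncs enforce it. A hypothetical end-to-end sketch (assumes a working `sql:local` instance; the pipe keys are illustrative):

```python
# Hypothetical sketch: syncing UUID values should register the 'uuid' dtype.
import uuid
import pandas as pd
import meerschaum as mrsm

pipe = mrsm.Pipe('demo', 'uuids', instance='sql:local', temporary=True)
df = pd.DataFrame({'id': [uuid.uuid4(), uuid.uuid4()], 'val': [1, 2]})
pipe.sync(df)
print(pipe.dtypes.get('id'))  # expected: 'uuid'
```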
meerschaum/core/Pipe/_verify.py
CHANGED
@@ -281,9 +281,9 @@ def verify(
 
 
 def get_chunks_success_message(
-        chunk_success_tuples: Dict[Tuple[Any, Any], SuccessTuple],
-        header: str = '',
-    ) -> str:
+    chunk_success_tuples: Dict[Tuple[Any, Any], SuccessTuple],
+    header: str = '',
+) -> str:
     """
     Sum together all of the inserts and updates from the chunks.
 
@@ -323,8 +323,8 @@ def get_chunks_success_message(
             + ([f'updated {num_updated}'] if num_updated else [])
             + ([f'upserted {num_upserted}'] if num_upserted else [])
         ) or ['synced 0'],
-        quotes = False,
-        and_ = False,
+        quotes=False,
+        and_=False,
     )
 
     success_msg = (
|