meerschaum 2.7.10__py3-none-any.whl → 2.8.1__py3-none-any.whl
- meerschaum/_internal/arguments/_parser.py +17 -5
- meerschaum/actions/copy.py +3 -1
- meerschaum/actions/index.py +1 -1
- meerschaum/actions/show.py +7 -7
- meerschaum/actions/sync.py +5 -1
- meerschaum/actions/verify.py +18 -2
- meerschaum/api/__init__.py +77 -41
- meerschaum/api/_exceptions.py +18 -0
- meerschaum/api/dash/__init__.py +4 -2
- meerschaum/api/dash/callbacks/dashboard.py +30 -1
- meerschaum/api/dash/components.py +2 -2
- meerschaum/api/dash/webterm.py +23 -4
- meerschaum/api/models/_pipes.py +8 -8
- meerschaum/api/resources/static/css/dash.css +2 -2
- meerschaum/api/resources/templates/termpage.html +5 -1
- meerschaum/api/routes/__init__.py +15 -12
- meerschaum/api/routes/_connectors.py +30 -28
- meerschaum/api/routes/_index.py +16 -7
- meerschaum/api/routes/_misc.py +30 -22
- meerschaum/api/routes/_pipes.py +244 -148
- meerschaum/api/routes/_plugins.py +58 -47
- meerschaum/api/routes/_users.py +39 -31
- meerschaum/api/routes/_version.py +8 -10
- meerschaum/config/_default.py +10 -0
- meerschaum/config/_version.py +1 -1
- meerschaum/config/static/__init__.py +4 -1
- meerschaum/connectors/api/_APIConnector.py +4 -3
- meerschaum/connectors/api/_login.py +21 -17
- meerschaum/connectors/api/_pipes.py +1 -0
- meerschaum/connectors/api/_request.py +9 -10
- meerschaum/connectors/sql/_cli.py +11 -3
- meerschaum/connectors/sql/_instance.py +1 -1
- meerschaum/connectors/sql/_pipes.py +77 -57
- meerschaum/connectors/sql/_sql.py +26 -9
- meerschaum/core/Pipe/__init__.py +2 -0
- meerschaum/core/Pipe/_attributes.py +13 -2
- meerschaum/core/Pipe/_data.py +85 -0
- meerschaum/core/Pipe/_deduplicate.py +6 -8
- meerschaum/core/Pipe/_sync.py +63 -30
- meerschaum/core/Pipe/_verify.py +243 -77
- meerschaum/core/User/__init__.py +2 -6
- meerschaum/jobs/_Job.py +1 -1
- meerschaum/jobs/__init__.py +15 -0
- meerschaum/utils/dataframe.py +2 -0
- meerschaum/utils/dtypes/sql.py +26 -0
- meerschaum/utils/formatting/_pipes.py +1 -1
- meerschaum/utils/misc.py +11 -7
- meerschaum/utils/packages/_packages.py +1 -1
- meerschaum/utils/sql.py +6 -2
- {meerschaum-2.7.10.dist-info → meerschaum-2.8.1.dist-info}/METADATA +4 -4
- {meerschaum-2.7.10.dist-info → meerschaum-2.8.1.dist-info}/RECORD +57 -56
- {meerschaum-2.7.10.dist-info → meerschaum-2.8.1.dist-info}/LICENSE +0 -0
- {meerschaum-2.7.10.dist-info → meerschaum-2.8.1.dist-info}/NOTICE +0 -0
- {meerschaum-2.7.10.dist-info → meerschaum-2.8.1.dist-info}/WHEEL +0 -0
- {meerschaum-2.7.10.dist-info → meerschaum-2.8.1.dist-info}/entry_points.txt +0 -0
- {meerschaum-2.7.10.dist-info → meerschaum-2.8.1.dist-info}/top_level.txt +0 -0
- {meerschaum-2.7.10.dist-info → meerschaum-2.8.1.dist-info}/zip-safe +0 -0
meerschaum/connectors/sql/_pipes.py
CHANGED
@@ -7,6 +7,7 @@ Interact with Pipes metadata via SQLConnector.
 """
 from __future__ import annotations
 from datetime import datetime, date, timedelta
+
 import meerschaum as mrsm
 from meerschaum.utils.typing import (
     Union, Any, SuccessTuple, Tuple, Dict, Optional, List
@@ -1837,7 +1838,7 @@ def sync_pipe(
         and primary_key in unseen_df.columns
         and autoincrement
     )
-    stats = {'success': True, 'msg': '
+    stats = {'success': True, 'msg': ''}
    if len(unseen_df) > 0:
         with self.engine.connect() as connection:
             with connection.begin():
@@ -1949,6 +1950,7 @@ def sync_pipe(
             datetime_col=(dt_col if dt_col in update_df.columns else None),
             identity_insert=(autoincrement and primary_key in update_df.columns),
             null_indices=pipe.null_indices,
+            cast_columns=pipe.enforce,
             debug=debug,
         )
         update_results = self.exec_queries(
@@ -1967,12 +1969,16 @@ def sync_pipe(
         if not update_success:
             warn(f"Failed to apply update to {pipe}.")
         stats['success'] = stats['success'] and update_success
-        stats['msg'] = (
+        stats['msg'] = (
+            (stats.get('msg', '') + f'\nFailed to apply update to {pipe}.').lstrip()
+            if not update_success
+            else stats.get('msg', '')
+        )

     stop = time.perf_counter()
     success = stats['success']
     if not success:
-        return success, stats['msg']
+        return success, stats['msg'] or str(stats)

     unseen_count = len(unseen_df.index) if unseen_df is not None else 0
     update_count = len(update_df.index) if update_df is not None else 0
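
A pure-Python sketch of the failure-message accumulation introduced in the `sync_pipe` hunk above (names are taken from the diff; this is an illustration, not the connector code):

    def accumulate_update_failure(stats, update_success, pipe='Pipe(...)'):
        """Mirror the msg/success bookkeeping from the hunk above."""
        stats['success'] = stats['success'] and update_success
        stats['msg'] = (
            (stats.get('msg', '') + f'\nFailed to apply update to {pipe}.').lstrip()
            if not update_success
            else stats.get('msg', '')
        )
        return stats

    print(accumulate_update_failure({'success': True, 'msg': ''}, False))
    # {'success': False, 'msg': 'Failed to apply update to Pipe(...).'}

Returning `stats['msg'] or str(stats)` on failure then guarantees a non-empty message even when no update batch recorded a reason.
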
@@ -2529,6 +2535,7 @@ def sync_pipe_inplace(
             datetime_col=pipe.columns.get('datetime', None),
             flavor=self.flavor,
             null_indices=pipe.null_indices,
+            cast_columns=pipe.enforce,
             debug=debug,
         )
         if on_cols else []
@@ -2585,6 +2592,7 @@ def get_sync_time(
     pipe: 'mrsm.Pipe',
     params: Optional[Dict[str, Any]] = None,
     newest: bool = True,
+    remote: bool = False,
     debug: bool = False,
 ) -> Union[datetime, int, None]:
     """Get a Pipe's most recent datetime value.
@@ -2602,50 +2610,76 @@ def get_sync_time(
         If `True`, get the most recent datetime (honoring `params`).
         If `False`, get the oldest datetime (ASC instead of DESC).

+    remote: bool, default False
+        If `True`, return the sync time for the remote fetch definition.
+
     Returns
     -------
     A `datetime` object (or `int` if using an integer axis) if the pipe exists, otherwise `None`.
     """
-    from meerschaum.utils.sql import sql_item_name, build_where
-
+    from meerschaum.utils.sql import sql_item_name, build_where, wrap_query_with_cte
+    src_name = sql_item_name('src', self.flavor)
+    table_name = sql_item_name(pipe.target, self.flavor, self.get_pipe_schema(pipe))

     dt_col = pipe.columns.get('datetime', None)
     if dt_col is None:
         return None
     dt_col_name = sql_item_name(dt_col, self.flavor, None)

+    if remote and pipe.connector.type != 'sql':
+        warn(f"Cannot get the remote sync time for {pipe}.")
+        return None
+
     ASC_or_DESC = "DESC" if newest else "ASC"
     existing_cols = pipe.get_columns_types(debug=debug)
     valid_params = {}
     if params is not None:
         valid_params = {k: v for k, v in params.items() if k in existing_cols}
+    flavor = self.flavor if not remote else pipe.connector.flavor

     ### If no bounds are provided for the datetime column,
     ### add IS NOT NULL to the WHERE clause.
     if dt_col not in valid_params:
         valid_params[dt_col] = '_None'
     where = "" if not valid_params else build_where(valid_params, self)
-
+    src_query = (
+        f"SELECT {dt_col_name}\nFROM {table_name}{where}"
+        if not remote
+        else self.get_pipe_metadef(pipe, params=params, begin=None, end=None)
+    )
+
+    base_query = (
+        f"SELECT {dt_col_name}\n"
+        f"FROM {src_name}{where}\n"
+        f"ORDER BY {dt_col_name} {ASC_or_DESC}\n"
+        f"LIMIT 1"
+    )
     if self.flavor == 'mssql':
-
+        base_query = (
+            f"SELECT TOP 1 {dt_col_name}\n"
+            f"FROM {src_name}{where}\n"
+            f"ORDER BY {dt_col_name} {ASC_or_DESC}"
+        )
     elif self.flavor == 'oracle':
-
+        base_query = (
             "SELECT * FROM (\n"
-
-
-
+            f" SELECT {dt_col_name}\n"
+            f" FROM {src_name}{where}\n"
+            f" ORDER BY {dt_col_name} {ASC_or_DESC}\n"
+            ") WHERE ROWNUM = 1"
         )

+    query = wrap_query_with_cte(src_query, base_query, flavor)
+
     try:
-        db_time = self.value(
+        db_time = self.value(query, silent=True, debug=debug)

         ### No datetime could be found.
         if db_time is None:
             return None
         ### sqlite returns str.
         if isinstance(db_time, str):
-
-            dateutil_parser = attempt_import('dateutil.parser')
+            dateutil_parser = mrsm.attempt_import('dateutil.parser')
             st = dateutil_parser.parse(db_time)
         ### Do nothing if a datetime object is returned.
         elif isinstance(db_time, datetime):
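
A hedged usage sketch of the new `remote` flag: with `remote=True`, the sync time is computed over the pipe's fetch definition rather than its target table (SQL connectors only, per the hunk above). The connector keys and the `fetch:definition` query below are hypothetical and assume a configured instance.

    import meerschaum as mrsm

    pipe = mrsm.Pipe(
        'sql:remote', 'sensors',
        instance='sql:main',
        columns={'datetime': 'ts'},
        parameters={'fetch': {'definition': 'SELECT ts, value FROM sensors'}},
    )

    conn = pipe.instance_connector
    local_st = conn.get_sync_time(pipe)                # newest 'ts' in the target table
    remote_st = conn.get_sync_time(pipe, remote=True)  # newest 'ts' from the fetch definition
    print(local_st, remote_st)
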
@@ -2743,7 +2777,7 @@ def get_pipe_rowcount(
     An `int` for the number of rows if the `pipe` exists, otherwise `None`.

     """
-    from meerschaum.utils.sql import dateadd_str, sql_item_name, wrap_query_with_cte
+    from meerschaum.utils.sql import dateadd_str, sql_item_name, wrap_query_with_cte, build_where
     from meerschaum.connectors.sql._fetch import get_pipe_query
     from meerschaum.utils.dtypes.sql import get_db_type_from_pd_type
     if remote:
@@ -2755,18 +2789,20 @@ def get_pipe_rowcount(
             error(msg)
             return None

-    _pipe_name = sql_item_name(pipe.target, self.flavor, self.get_pipe_schema(pipe))

+    flavor = self.flavor if not remote else pipe.connector.flavor
+    conn = self if not remote else pipe.connector
+    _pipe_name = sql_item_name(pipe.target, flavor, self.get_pipe_schema(pipe))
     dt_col = pipe.columns.get('datetime', None)
     dt_typ = pipe.dtypes.get(dt_col, 'datetime') if dt_col else None
-    dt_db_type = get_db_type_from_pd_type(dt_typ,
+    dt_db_type = get_db_type_from_pd_type(dt_typ, flavor) if dt_typ else None
     if not dt_col:
         dt_col = pipe.guess_datetime()
-        dt_name = sql_item_name(dt_col,
+        dt_name = sql_item_name(dt_col, flavor, None) if dt_col else None
         is_guess = True
     else:
         dt_col = pipe.get_columns('datetime')
-        dt_name = sql_item_name(dt_col,
+        dt_name = sql_item_name(dt_col, flavor, None)
         is_guess = False

     if begin is not None or end is not None:
@@ -2786,32 +2822,15 @@ def get_pipe_rowcount(
             )


-    _datetime_name = sql_item_name(
-        dt_col,
-        (
-            pipe.instance_connector.flavor
-            if not remote
-            else pipe.connector.flavor
-        ),
-        None,
-    )
+    _datetime_name = sql_item_name(dt_col, flavor)
     _cols_names = [
-        sql_item_name(
-            col,
-            (
-                pipe.instance_connector.flavor
-                if not remote
-                else pipe.connector.flavor
-            ),
-            None,
-        )
+        sql_item_name(col, flavor)
         for col in set(
             (
                 [dt_col]
                 if dt_col
                 else []
-            )
-            + (
+            ) + (
                 []
                 if params is None
                 else list(params.keys())
@@ -2826,34 +2845,33 @@ def get_pipe_rowcount(
         if not remote
         else get_pipe_query(pipe)
     )
-    parent_query = f"SELECT COUNT(*)\nFROM {sql_item_name('src',
-    query = wrap_query_with_cte(src, parent_query,
+    parent_query = f"SELECT COUNT(*)\nFROM {sql_item_name('src', flavor)}"
+    query = wrap_query_with_cte(src, parent_query, flavor)
     if begin is not None or end is not None:
         query += "\nWHERE"
     if begin is not None:
         query += (
             f"\n {dt_name} >= "
-            + dateadd_str(
+            + dateadd_str(flavor, datepart='minute', number=0, begin=begin, db_type=dt_db_type)
         )
     if end is not None and begin is not None:
         query += "\n AND"
     if end is not None:
         query += (
             f"\n {dt_name} < "
-            + dateadd_str(
+            + dateadd_str(flavor, datepart='minute', number=0, begin=end, db_type=dt_db_type)
         )
     if params is not None:
-        from meerschaum.utils.sql import build_where
         existing_cols = pipe.get_columns_types(debug=debug)
         valid_params = {k: v for k, v in params.items() if k in existing_cols}
         if valid_params:
-            query += build_where(valid_params,
+            query += build_where(valid_params, conn).replace('WHERE', (
                 'AND' if (begin is not None or end is not None)
                 else 'WHERE'
            )
         )

-    result =
+    result = conn.value(query, debug=debug, silent=True)
     try:
         return int(result)
     except Exception:
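
A short, hedged example of comparing the remote rowcount against the local one (hypothetical pipe keys; assumes the pipe's source is a SQL connector so its fetch definition can be counted):

    import meerschaum as mrsm

    pipe = mrsm.Pipe('sql:remote', 'sensors', instance='sql:main')  # hypothetical keys

    local_count = pipe.get_rowcount()
    remote_count = pipe.get_rowcount(remote=True)
    if remote_count is not None and local_count is not None and remote_count > local_count:
        print(f"{pipe} appears to be missing {remote_count - local_count:,} rows.")
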
@@ -3634,7 +3652,6 @@ def deduplicate_pipe(
     if not pipe.exists(debug=debug):
         return False, f"Table {pipe_table_name} does not exist."

-    ### TODO: Handle deleting duplicates without a datetime axis.
     dt_col = pipe.columns.get('datetime', None)
     cols_types = pipe.get_columns_types(debug=debug)
     existing_cols = pipe.get_columns_types(debug=debug)
@@ -3738,9 +3755,8 @@ def deduplicate_pipe(

     session_id = generate_password(3)

-    dedup_table =
-    temp_old_table =
-
+    dedup_table = self.get_temporary_target(pipe.target, transact_id=session_id, label='dedup')
+    temp_old_table = self.get_temporary_target(pipe.target, transact_id=session_id, label='old')
     temp_old_table_name = sql_item_name(temp_old_table, self.flavor, self.get_pipe_schema(pipe))

     create_temporary_table_query = get_create_table_query(
@@ -3753,16 +3769,21 @@ def deduplicate_pipe(
     if_exists_str = "IF EXISTS" if self.flavor in DROP_IF_EXISTS_FLAVORS else ""
     alter_queries = flatten_list([
         get_rename_table_queries(
-            pipe.target,
+            pipe.target,
+            temp_old_table,
+            self.flavor,
+            schema=self.get_pipe_schema(pipe),
         ),
         get_rename_table_queries(
-            dedup_table,
+            dedup_table,
+            pipe.target,
+            self.flavor,
+            schema=self.get_pipe_schema(pipe),
         ),
-        f"""
-        DROP TABLE {if_exists_str} {temp_old_table_name}
-        """,
+        f"DROP TABLE {if_exists_str} {temp_old_table_name}",
     ])

+    self._log_temporary_tables_creation(temp_old_table, create=(not pipe.temporary), debug=debug)
     create_temporary_result = self.execute(create_temporary_table_query, debug=debug)
     if create_temporary_result is None:
         return False, f"Failed to deduplicate table {pipe_table_name}."
@@ -3794,8 +3815,7 @@ def deduplicate_pipe(
                 f"\nfrom {old_rowcount:,} to {new_rowcount:,} rows"
                 if old_rowcount != new_rowcount
                 else ''
-            )
-            + '.'
+            ) + '.'
         )
         if success
         else f"Failed to execute query:\n{fail_query}"
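
For readability, the table swap assembled by `alter_queries` above boils down to three steps. The statements below are illustrative generic SQL only (the real ones come from `get_rename_table_queries()` and vary by flavor), with hypothetical table names:

    target = 'sensors'
    temp_old_table = 'sensors_old_abc'   # hypothetical name from get_temporary_target()
    dedup_table = 'sensors_dedup_abc'

    alter_queries = [
        f'ALTER TABLE "{target}" RENAME TO "{temp_old_table}"',  # 1. move the live table aside
        f'ALTER TABLE "{dedup_table}" RENAME TO "{target}"',     # 2. promote the deduplicated copy
        f'DROP TABLE IF EXISTS "{temp_old_table}"',              # 3. discard the old copy
    ]
    print('\n'.join(alter_queries))
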
meerschaum/connectors/sql/_sql.py
CHANGED
@@ -773,7 +773,6 @@ def to_sql(
     """
     import time
     import json
-    from decimal import Decimal
     from datetime import timedelta
     from meerschaum.utils.warnings import error, warn
     import warnings
@@ -823,6 +822,7 @@ def to_sql(

     bytes_cols = get_bytes_cols(df)
     numeric_cols = get_numeric_cols(df)
+    ### NOTE: This excludes non-numeric serialized Decimals (e.g. SQLite).
     numeric_cols_dtypes = {
         col: typ
         for col, typ in kw.get('dtype', {}).items()
@@ -833,6 +833,27 @@ def to_sql(

     }
     numeric_cols.extend([col for col in numeric_cols_dtypes if col not in numeric_cols])
+    numeric_cols_precisions_scales = {
+        col: (
+            (typ.precision, typ.scale)
+            if hasattr(typ, 'precision')
+            else get_numeric_precision_scale(self.flavor)
+        )
+        for col, typ in numeric_cols_dtypes.items()
+    }
+    cols_pd_types = {
+        col: get_pd_type_from_db_type(str(typ))
+        for col, typ in kw.get('dtype', {}).items()
+    }
+    cols_pd_types.update({
+        col: f'numeric[{precision},{scale}]'
+        for col, (precision, scale) in numeric_cols_precisions_scales.items()
+        if precision and scale
+    })
+    cols_db_types = {
+        col: get_db_type_from_pd_type(typ, flavor=self.flavor)
+        for col, typ in cols_pd_types.items()
+    }

     enable_bulk_insert = mrsm.get_config(
         'system', 'connectors', 'sql', 'bulk_insert'
@@ -844,7 +865,7 @@ def to_sql(
     if method == "":
         if enable_bulk_insert:
             method = (
-                functools.partial(mssql_insert_json, debug=debug)
+                functools.partial(mssql_insert_json, cols_types=cols_db_types, debug=debug)
                 if self.flavor == 'mssql'
                 else functools.partial(psql_insert_copy, debug=debug)
             )
@@ -867,14 +888,10 @@ def to_sql(

     ### Check for numeric columns.
     for col in numeric_cols:
-
-
-
-            (typ.precision, typ.scale)
-            if hasattr(typ, 'precision')
-            else get_numeric_precision_scale(self.flavor)
+        precision, scale = numeric_cols_precisions_scales.get(
+            col,
+            get_numeric_precision_scale(self.flavor)
         )
-
         df[col] = df[col].apply(
             functools.partial(
                 serialize_decimal,
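
A small, self-contained sketch of the precision/scale bookkeeping added above: columns whose SQLAlchemy type carries `precision`/`scale` keep those values, everything else falls back to the flavor default (the `Numeric` stand-in and the default tuple below are illustrative, not the real helpers):

    class Numeric:
        """Stand-in for a SQLAlchemy NUMERIC type."""
        def __init__(self, precision, scale):
            self.precision, self.scale = precision, scale

    def get_numeric_precision_scale(flavor):
        return (28, 10)  # assumed default for the sketch; the real helper lives in meerschaum.utils.dtypes.sql

    numeric_cols_dtypes = {'price': Numeric(19, 4), 'ratio': object()}
    numeric_cols_precisions_scales = {
        col: (
            (typ.precision, typ.scale)
            if hasattr(typ, 'precision')
            else get_numeric_precision_scale('postgresql')
        )
        for col, typ in numeric_cols_dtypes.items()
    }
    cols_pd_types = {
        col: f'numeric[{precision},{scale}]'
        for col, (precision, scale) in numeric_cols_precisions_scales.items()
        if precision and scale
    }
    print(numeric_cols_precisions_scales)  # {'price': (19, 4), 'ratio': (28, 10)}
    print(cols_pd_types)                   # {'price': 'numeric[19,4]', 'ratio': 'numeric[28,10]'}

Per the hunks above, these `numeric[p,s]` strings are mapped back to database types via `get_db_type_from_pd_type()` and handed to `mssql_insert_json` as `cols_types`.
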
meerschaum/core/Pipe/__init__.py
CHANGED
@@ -92,6 +92,7 @@ class Pipe:
         _get_data_as_iterator,
         get_chunk_interval,
         get_chunk_bounds,
+        get_chunk_bounds_batches,
         parse_date_bounds,
     )
     from ._register import register
@@ -117,6 +118,7 @@ class Pipe:
         id,
         get_val_column,
         parents,
+        parent,
         children,
         target,
         _target_legacy,
meerschaum/core/Pipe/_attributes.py
CHANGED
@@ -590,7 +590,7 @@ def get_val_column(self, debug: bool = False) -> Union[str, None]:


 @property
-def parents(self) -> List[
+def parents(self) -> List[mrsm.Pipe]:
     """
     Return a list of `meerschaum.Pipe` objects to be designated as parents.
     """
@@ -617,7 +617,18 @@ def parents(self) -> List[meerschaum.Pipe]:


 @property
-def
+def parent(self) -> Union[mrsm.Pipe, None]:
+    """
+    Return the first pipe in `self.parents` or `None`.
+    """
+    parents = self.parents
+    if not parents:
+        return None
+    return parents[0]
+
+
+@property
+def children(self) -> List[mrsm.Pipe]:
     """
     Return a list of `meerschaum.Pipe` objects to be designated as children.
     """
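
A brief usage sketch of the new convenience property (hypothetical pipe keys; assumes a configured instance):

    import meerschaum as mrsm

    pipe = mrsm.Pipe('sql:main', 'clean', instance='sql:main')  # hypothetical keys

    first_parent = pipe.parent  # same as (pipe.parents[0] if pipe.parents else None)
    if first_parent is None:
        print(f"{pipe} has no parents configured.")
    else:
        print(f"{pipe} derives from {first_parent}.")
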
meerschaum/core/Pipe/_data.py
CHANGED
@@ -544,11 +544,33 @@ def get_rowcount(
     from meerschaum.utils.warnings import warn
     from meerschaum.utils.venv import Venv
     from meerschaum.connectors import get_connector_plugin
+    from meerschaum.utils.misc import filter_keywords

     begin, end = self.parse_date_bounds(begin, end)
     connector = self.instance_connector if not remote else self.connector
     try:
         with Venv(get_connector_plugin(connector)):
+            if not hasattr(connector, 'get_pipe_rowcount'):
+                warn(
+                    f"Connectors of type '{connector.type}' "
+                    "do not implement `get_pipe_rowcount()`.",
+                    stack=False,
+                )
+                return 0
+            kwargs = filter_keywords(
+                connector.get_pipe_rowcount,
+                begin=begin,
+                end=end,
+                params=params,
+                remote=remote,
+                debug=debug,
+            )
+            if remote and 'remote' not in kwargs:
+                warn(
+                    f"Connectors of type '{connector.type}' do not support remote rowcounts.",
+                    stack=False,
+                )
+                return 0
             rowcount = connector.get_pipe_rowcount(
                 self,
                 begin=begin,
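
The `filter_keywords()` guard above forwards only the keyword arguments the connector's `get_pipe_rowcount()` actually accepts, so unsupported `remote` requests are caught up front. A hedged sketch with a stand-in connector method:

    from meerschaum.utils.misc import filter_keywords

    def get_pipe_rowcount(pipe, begin=None, end=None, params=None, debug=False):
        """Stand-in connector method that does not accept `remote`."""
        return 0

    kwargs = filter_keywords(
        get_pipe_rowcount,
        begin=None, end=None, params=None, remote=True, debug=False,
    )
    print('remote' in kwargs)  # False -> warn and return 0 instead of passing an unexpected kwarg
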
@@ -651,12 +673,19 @@ def get_chunk_bounds(
     A list of chunk bounds (datetimes or integers).
     If unbounded, the first and last chunks will include `None`.
     """
+    from datetime import timedelta
+    from meerschaum.utils.dtypes import are_dtypes_equal
+    from meerschaum.utils.misc import interval_str
     include_less_than_begin = not bounded and begin is None
     include_greater_than_end = not bounded and end is None
     if begin is None:
         begin = self.get_sync_time(newest=False, debug=debug)
     if end is None:
         end = self.get_sync_time(newest=True, debug=debug)
+        if end is not None and hasattr(end, 'tzinfo'):
+            end += timedelta(minutes=1)
+        elif are_dtypes_equal(str(type(end)), 'int'):
+            end += 1
     if begin is None and end is None:
         return [(None, None)]

@@ -670,10 +699,17 @@ def get_chunk_bounds(
     ### Run `verify pipes --workers 1` to sync chunks in series.
     chunk_bounds = []
     begin_cursor = begin
+    num_chunks = 0
+    max_chunks = 1_000_000
     while begin_cursor < end:
         end_cursor = begin_cursor + chunk_interval
         chunk_bounds.append((begin_cursor, end_cursor))
         begin_cursor = end_cursor
+        num_chunks += 1
+        if num_chunks >= max_chunks:
+            raise ValueError(
+                f"Too many chunks of size '{interval_str(chunk_interval)}' between '{begin}' and '{end}'."
+            )

     ### The chunk interval might be too large.
     if not chunk_bounds and end >= begin:
@@ -695,6 +731,55 @@ def get_chunk_bounds(
     return chunk_bounds


+def get_chunk_bounds_batches(
+    self,
+    chunk_bounds: List[Tuple[Union[datetime, int, None], Union[datetime, int, None]]],
+    batchsize: Optional[int] = None,
+    workers: Optional[int] = None,
+    debug: bool = False,
+) -> List[
+    Tuple[
+        Tuple[
+            Union[datetime, int, None],
+            Union[datetime, int, None],
+        ], ...
+    ]
+]:
+    """
+    Return a list of tuples of chunk bounds of size `batchsize`.
+
+    Parameters
+    ----------
+    chunk_bounds: List[Tuple[Union[datetime, int, None], Union[datetime, int, None]]]
+        A list of chunk_bounds (see `Pipe.get_chunk_bounds()`).
+
+    batchsize: Optional[int], default None
+        How many chunks to include in a batch. Defaults to `Pipe.get_num_workers()`.
+
+    workers: Optional[int], default None
+        If `batchsize` is `None`, use this as the desired number of workers.
+        Passed to `Pipe.get_num_workers()`.
+
+    Returns
+    -------
+    A list of tuples of chunk bound tuples.
+    """
+    from meerschaum.utils.misc import iterate_chunks
+
+    if batchsize is None:
+        batchsize = self.get_num_workers(workers=workers)
+
+    return [
+        tuple(
+            _batch_chunk_bounds
+            for _batch_chunk_bounds in batch
+            if _batch_chunk_bounds is not None
+        )
+        for batch in iterate_chunks(chunk_bounds, batchsize)
+        if batch
+    ]
+
+
 def parse_date_bounds(self, *dt_vals: Union[datetime, int, None]) -> Union[
     datetime,
     int,
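
A hedged usage sketch of batching chunk bounds for parallel work (hypothetical pipe keys; a `ThreadPoolExecutor` is just one way to consume the batches):

    from concurrent.futures import ThreadPoolExecutor
    import meerschaum as mrsm

    pipe = mrsm.Pipe('sql:remote', 'sensors', instance='sql:main', columns={'datetime': 'ts'})

    chunk_bounds = pipe.get_chunk_bounds(bounded=True)
    batches = pipe.get_chunk_bounds_batches(chunk_bounds, batchsize=4)

    def sync_chunk(bounds):
        begin, end = bounds
        return pipe.sync(begin=begin, end=end)

    for batch in batches:
        # Process one batch of chunks concurrently before moving to the next batch.
        with ThreadPoolExecutor(max_workers=len(batch)) as pool:
            results = list(pool.map(sync_chunk, batch))
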
meerschaum/core/Pipe/_deduplicate.py
CHANGED
@@ -110,13 +110,12 @@ def deduplicate(
     )
     if bounded and end is None:
         end = self.get_sync_time(newest=True, debug=debug)
-
-
-
-
-
-
-        )
+    if end is not None:
+        end += (
+            timedelta(minutes=1)
+            if hasattr(end, 'tzinfo')
+            else 1
+        )

     chunk_bounds = self.get_chunk_bounds(
         bounded=bounded,
@@ -129,7 +128,6 @@ def deduplicate(
     indices = [col for col in self.columns.values() if col]
     if not indices:
         return False, "Cannot deduplicate without index columns."
-    dt_col = self.columns.get('datetime', None)

     def process_chunk_bounds(bounds) -> Tuple[
         Tuple[