dcs-sdk 1.6.5 (dcs_sdk-1.6.5-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_diff/__init__.py +221 -0
- data_diff/__main__.py +517 -0
- data_diff/abcs/__init__.py +13 -0
- data_diff/abcs/compiler.py +27 -0
- data_diff/abcs/database_types.py +402 -0
- data_diff/config.py +141 -0
- data_diff/databases/__init__.py +38 -0
- data_diff/databases/_connect.py +323 -0
- data_diff/databases/base.py +1417 -0
- data_diff/databases/bigquery.py +376 -0
- data_diff/databases/clickhouse.py +217 -0
- data_diff/databases/databricks.py +262 -0
- data_diff/databases/duckdb.py +207 -0
- data_diff/databases/mssql.py +343 -0
- data_diff/databases/mysql.py +189 -0
- data_diff/databases/oracle.py +238 -0
- data_diff/databases/postgresql.py +293 -0
- data_diff/databases/presto.py +222 -0
- data_diff/databases/redis.py +93 -0
- data_diff/databases/redshift.py +233 -0
- data_diff/databases/snowflake.py +222 -0
- data_diff/databases/sybase.py +720 -0
- data_diff/databases/trino.py +73 -0
- data_diff/databases/vertica.py +174 -0
- data_diff/diff_tables.py +489 -0
- data_diff/errors.py +17 -0
- data_diff/format.py +369 -0
- data_diff/hashdiff_tables.py +1026 -0
- data_diff/info_tree.py +76 -0
- data_diff/joindiff_tables.py +434 -0
- data_diff/lexicographic_space.py +253 -0
- data_diff/parse_time.py +88 -0
- data_diff/py.typed +0 -0
- data_diff/queries/__init__.py +13 -0
- data_diff/queries/api.py +213 -0
- data_diff/queries/ast_classes.py +811 -0
- data_diff/queries/base.py +38 -0
- data_diff/queries/extras.py +43 -0
- data_diff/query_utils.py +70 -0
- data_diff/schema.py +67 -0
- data_diff/table_segment.py +583 -0
- data_diff/thread_utils.py +112 -0
- data_diff/utils.py +1022 -0
- data_diff/version.py +15 -0
- dcs_core/__init__.py +13 -0
- dcs_core/__main__.py +17 -0
- dcs_core/__version__.py +15 -0
- dcs_core/cli/__init__.py +13 -0
- dcs_core/cli/cli.py +165 -0
- dcs_core/core/__init__.py +19 -0
- dcs_core/core/common/__init__.py +13 -0
- dcs_core/core/common/errors.py +50 -0
- dcs_core/core/common/models/__init__.py +13 -0
- dcs_core/core/common/models/configuration.py +284 -0
- dcs_core/core/common/models/dashboard.py +24 -0
- dcs_core/core/common/models/data_source_resource.py +75 -0
- dcs_core/core/common/models/metric.py +160 -0
- dcs_core/core/common/models/profile.py +75 -0
- dcs_core/core/common/models/validation.py +216 -0
- dcs_core/core/common/models/widget.py +44 -0
- dcs_core/core/configuration/__init__.py +13 -0
- dcs_core/core/configuration/config_loader.py +139 -0
- dcs_core/core/configuration/configuration_parser.py +262 -0
- dcs_core/core/configuration/configuration_parser_arc.py +328 -0
- dcs_core/core/datasource/__init__.py +13 -0
- dcs_core/core/datasource/base.py +62 -0
- dcs_core/core/datasource/manager.py +112 -0
- dcs_core/core/datasource/search_datasource.py +421 -0
- dcs_core/core/datasource/sql_datasource.py +1094 -0
- dcs_core/core/inspect.py +163 -0
- dcs_core/core/logger/__init__.py +13 -0
- dcs_core/core/logger/base.py +32 -0
- dcs_core/core/logger/default_logger.py +94 -0
- dcs_core/core/metric/__init__.py +13 -0
- dcs_core/core/metric/base.py +220 -0
- dcs_core/core/metric/combined_metric.py +98 -0
- dcs_core/core/metric/custom_metric.py +34 -0
- dcs_core/core/metric/manager.py +137 -0
- dcs_core/core/metric/numeric_metric.py +403 -0
- dcs_core/core/metric/reliability_metric.py +90 -0
- dcs_core/core/profiling/__init__.py +13 -0
- dcs_core/core/profiling/datasource_profiling.py +136 -0
- dcs_core/core/profiling/numeric_field_profiling.py +72 -0
- dcs_core/core/profiling/text_field_profiling.py +67 -0
- dcs_core/core/repository/__init__.py +13 -0
- dcs_core/core/repository/metric_repository.py +77 -0
- dcs_core/core/utils/__init__.py +13 -0
- dcs_core/core/utils/log.py +29 -0
- dcs_core/core/utils/tracking.py +105 -0
- dcs_core/core/utils/utils.py +44 -0
- dcs_core/core/validation/__init__.py +13 -0
- dcs_core/core/validation/base.py +230 -0
- dcs_core/core/validation/completeness_validation.py +153 -0
- dcs_core/core/validation/custom_query_validation.py +24 -0
- dcs_core/core/validation/manager.py +282 -0
- dcs_core/core/validation/numeric_validation.py +276 -0
- dcs_core/core/validation/reliability_validation.py +91 -0
- dcs_core/core/validation/uniqueness_validation.py +61 -0
- dcs_core/core/validation/validity_validation.py +738 -0
- dcs_core/integrations/__init__.py +13 -0
- dcs_core/integrations/databases/__init__.py +13 -0
- dcs_core/integrations/databases/bigquery.py +187 -0
- dcs_core/integrations/databases/databricks.py +51 -0
- dcs_core/integrations/databases/db2.py +652 -0
- dcs_core/integrations/databases/elasticsearch.py +61 -0
- dcs_core/integrations/databases/mssql.py +829 -0
- dcs_core/integrations/databases/mysql.py +409 -0
- dcs_core/integrations/databases/opensearch.py +64 -0
- dcs_core/integrations/databases/oracle.py +719 -0
- dcs_core/integrations/databases/postgres.py +482 -0
- dcs_core/integrations/databases/redshift.py +53 -0
- dcs_core/integrations/databases/snowflake.py +48 -0
- dcs_core/integrations/databases/spark_df.py +111 -0
- dcs_core/integrations/databases/sybase.py +1069 -0
- dcs_core/integrations/storage/__init__.py +13 -0
- dcs_core/integrations/storage/local_file.py +149 -0
- dcs_core/integrations/utils/__init__.py +13 -0
- dcs_core/integrations/utils/utils.py +36 -0
- dcs_core/report/__init__.py +13 -0
- dcs_core/report/dashboard.py +211 -0
- dcs_core/report/models.py +88 -0
- dcs_core/report/static/assets/fonts/DMSans-Bold.ttf +0 -0
- dcs_core/report/static/assets/fonts/DMSans-Medium.ttf +0 -0
- dcs_core/report/static/assets/fonts/DMSans-Regular.ttf +0 -0
- dcs_core/report/static/assets/fonts/DMSans-SemiBold.ttf +0 -0
- dcs_core/report/static/assets/images/docs.svg +6 -0
- dcs_core/report/static/assets/images/github.svg +4 -0
- dcs_core/report/static/assets/images/logo.svg +7 -0
- dcs_core/report/static/assets/images/slack.svg +13 -0
- dcs_core/report/static/index.js +2 -0
- dcs_core/report/static/index.js.LICENSE.txt +3971 -0
- dcs_sdk/__init__.py +13 -0
- dcs_sdk/__main__.py +18 -0
- dcs_sdk/__version__.py +15 -0
- dcs_sdk/cli/__init__.py +13 -0
- dcs_sdk/cli/cli.py +163 -0
- dcs_sdk/sdk/__init__.py +58 -0
- dcs_sdk/sdk/config/__init__.py +13 -0
- dcs_sdk/sdk/config/config_loader.py +491 -0
- dcs_sdk/sdk/data_diff/__init__.py +13 -0
- dcs_sdk/sdk/data_diff/data_differ.py +821 -0
- dcs_sdk/sdk/rules/__init__.py +15 -0
- dcs_sdk/sdk/rules/rules_mappping.py +31 -0
- dcs_sdk/sdk/rules/rules_repository.py +214 -0
- dcs_sdk/sdk/rules/schema_rules.py +65 -0
- dcs_sdk/sdk/utils/__init__.py +13 -0
- dcs_sdk/sdk/utils/serializer.py +25 -0
- dcs_sdk/sdk/utils/similarity_score/__init__.py +13 -0
- dcs_sdk/sdk/utils/similarity_score/base_provider.py +153 -0
- dcs_sdk/sdk/utils/similarity_score/cosine_similarity_provider.py +39 -0
- dcs_sdk/sdk/utils/similarity_score/jaccard_provider.py +24 -0
- dcs_sdk/sdk/utils/similarity_score/levenshtein_distance_provider.py +31 -0
- dcs_sdk/sdk/utils/table.py +475 -0
- dcs_sdk/sdk/utils/themes.py +40 -0
- dcs_sdk/sdk/utils/utils.py +349 -0
- dcs_sdk-1.6.5.dist-info/METADATA +150 -0
- dcs_sdk-1.6.5.dist-info/RECORD +159 -0
- dcs_sdk-1.6.5.dist-info/WHEEL +4 -0
- dcs_sdk-1.6.5.dist-info/entry_points.txt +4 -0
data_diff/table_segment.py
@@ -0,0 +1,583 @@
# Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import time
from decimal import Decimal
from itertools import product
from typing import Container, Dict, List, Optional, Sequence, Tuple

import attrs
import numpy as np
from loguru import logger
from typing_extensions import Self

from data_diff.abcs.database_types import DbKey, DbPath, DbTime, IKey, NumericType
from data_diff.databases.base import Database
from data_diff.databases.redis import RedisBackend
from data_diff.queries.api import (
    SKIP,
    Code,
    Count,
    Expr,
    and_,
    max_,
    min_,
    or_,
    table,
    this,
)
from data_diff.queries.extras import (
    ApplyFuncAndNormalizeAsString,
    Checksum,
    NormalizeAsString,
)
from data_diff.schema import RawColumnInfo, Schema, create_schema
from data_diff.utils import (
    ArithDate,
    ArithDateTime,
    ArithString,
    ArithTimestamp,
    ArithTimestampTZ,
    ArithUnicodeString,
    JobCancelledError,
    Vector,
    safezip,
    split_space,
)

# logger = logging.getLogger("table_segment")

RECOMMENDED_CHECKSUM_DURATION = 20


def split_key_space(min_key: DbKey, max_key: DbKey, count: int) -> List[DbKey]:
    assert min_key < max_key

    if max_key - min_key <= count:
        count = 1

    # Handle arithmetic string types (including temporal types)
    if isinstance(
        min_key, (ArithString, ArithUnicodeString, ArithDateTime, ArithDate, ArithTimestamp, ArithTimestampTZ)
    ):
        assert type(min_key) is type(max_key)
        checkpoints = min_key.range(max_key, count)
    else:
        # Handle numeric types
        if isinstance(min_key, Decimal):
            min_key = float(min_key)
        if isinstance(max_key, Decimal):
            max_key = float(max_key)
        checkpoints = split_space(min_key, max_key, count)

    assert all(min_key < x < max_key for x in checkpoints)
    return [min_key] + checkpoints + [max_key]


def int_product(nums: List[int]) -> int:
    p = 1
    for n in nums:
        p *= n
    return p


def split_compound_key_space(mn: Vector, mx: Vector, count: int) -> List[List[DbKey]]:
    """Returns a list of split-points for each key dimension, essentially returning an N-dimensional grid of split points."""
    return [split_key_space(mn_k, mx_k, count) for mn_k, mx_k in safezip(mn, mx)]


def create_mesh_from_points(*values_per_dim: list) -> List[Tuple[Vector, Vector]]:
    """Given a list of values along each axis of N dimensional space,
    return an array of boxes whose start-points & end-points align with the given values,
    and together consitute a mesh filling that space entirely (within the bounds of the given values).

    Assumes given values are already ordered ascending.

    len(boxes) == ∏i( len(i)-1 )

    Example:
        ::
        >>> d1 = 'a', 'b', 'c'
        >>> d2 = 1, 2, 3
        >>> d3 = 'X', 'Y'
        >>> create_mesh_from_points(d1, d2, d3)
        [
            [('a', 1, 'X'), ('b', 2, 'Y')],
            [('a', 2, 'X'), ('b', 3, 'Y')],
            [('b', 1, 'X'), ('c', 2, 'Y')],
            [('b', 2, 'X'), ('c', 3, 'Y')]
        ]
    """
    assert all(len(v) >= 2 for v in values_per_dim), values_per_dim

    # Create tuples of (v1, v2) for each pair of adjacent values
    ranges = [list(zip(values[:-1], values[1:])) for values in values_per_dim]

    assert all(a <= b for r in ranges for a, b in r)

    # Create a product of all the ranges
    res = [tuple(Vector(a) for a in safezip(*r)) for r in product(*ranges)]

    expected_len = int_product(len(v) - 1 for v in values_per_dim)
    assert len(res) == expected_len, (len(res), expected_len)
    return res


@attrs.define(frozen=True)
class TableSegment:
    """Signifies a segment of rows (and selected columns) within a table

    Parameters:
        database (Database): Database instance. See :meth:`connect`
        table_path (:data:`DbPath`): Path to table in form of a tuple. e.g. `('my_dataset', 'table_name')`
        key_columns (Tuple[str]): Name of the key column, which uniquely identifies each row (usually id)
        update_column (str, optional): Name of updated column, which signals that rows changed.
                                       Usually updated_at or last_update. Used by `min_update` and `max_update`.
        extra_columns (Tuple[str, ...], optional): Extra columns to compare
        transform_columns (Dict[str, str], optional): A dictionary mapping column names to SQL transformation expressions.
            These expressions are applied directly to the specified columns within the
            comparison query, *before* the data is hashed or compared. Useful for
            on-the-fly normalization (e.g., type casting, timezone conversions) without
            requiring intermediate views or staging tables. Defaults to an empty dict.
        min_key (:data:`Vector`, optional): Lowest key value, used to restrict the segment
        max_key (:data:`Vector`, optional): Highest key value, used to restrict the segment
        min_update (:data:`DbTime`, optional): Lowest update_column value, used to restrict the segment
        max_update (:data:`DbTime`, optional): Highest update_column value, used to restrict the segment
        where (str, optional): An additional 'where' expression to restrict the search space.

        case_sensitive (bool): If false, the case of column names will adjust according to the schema. Default is true.

    """

    # Location of table
    database: Database
    table_path: DbPath

    # Columns
    key_columns: Tuple[str, ...]
    update_column: Optional[str] = None
    extra_columns: Tuple[str, ...] = ()
    transform_columns: Dict[str, str] = {}
    ignored_columns: Container[str] = frozenset()

    # Restrict the segment
    min_key: Optional[Vector] = None
    max_key: Optional[Vector] = None
    min_update: Optional[DbTime] = None
    max_update: Optional[DbTime] = None
    where: Optional[str] = None

    case_sensitive: Optional[bool] = True
    _schema: Optional[Schema] = None

    query_stats: Dict = attrs.Factory(
        lambda: {
            "count_queries_stats": {
                "total_queries": 0,
                "min_time_ms": 0.0,
                "max_time_ms": 0.0,
                "avg_time_ms": 0.0,
                "p90_time_ms": 0.0,
                "total_time_taken_ms": 0.0,
                "_query_times": [],
            },
            "checksum_queries_stats": {
                "total_queries": 0,
                "min_time_ms": 0.0,
                "max_time_ms": 0.0,
                "avg_time_ms": 0.0,
                "p90_time_ms": 0.0,
                "total_time_taken_ms": 0.0,
                "_query_times": [],
            },
            "row_fetch_queries_stats": {
                "total_queries": 0,
                "min_time_ms": 0.0,
                "max_time_ms": 0.0,
                "avg_time_ms": 0.0,
                "p90_time_ms": 0.0,
                "total_time_taken_ms": 0.0,
                "_query_times": [],
            },
            "schema_queries_stats": {
                "total_queries": 0,
                "min_time_ms": 0.0,
                "max_time_ms": 0.0,
                "avg_time_ms": 0.0,
                "p90_time_ms": 0.0,
                "total_time_taken_ms": 0.0,
                "_query_times": [],
            },
        }
    )
    job_id: Optional[int] = None

    def __attrs_post_init__(self) -> None:
        if not self.update_column and (self.min_update or self.max_update):
            raise ValueError("Error: the min_update/max_update feature requires 'update_column' to be set.")

        if self.min_key is not None and self.max_key is not None and self.min_key >= self.max_key:
            raise ValueError(f"Error: min_key expected to be smaller than max_key! ({self.min_key} >= {self.max_key})")

        if self.min_update is not None and self.max_update is not None and self.min_update >= self.max_update:
            raise ValueError(
                f"Error: min_update expected to be smaller than max_update! ({self.min_update} >= {self.max_update})"
            )

    def _update_stats(self, stats_key: str, query_time_ms: float) -> None:
        logger.info(f"Query time for {stats_key.replace('_stats', '')}: {query_time_ms:.2f} ms")
        stats = self.query_stats[stats_key]
        stats["total_queries"] += 1
        stats["_query_times"].append(query_time_ms)
        stats["total_time_taken_ms"] += query_time_ms
        stats["min_time_ms"] = min(stats["_query_times"]) if stats["_query_times"] else 0.0
        stats["max_time_ms"] = max(stats["_query_times"]) if stats["_query_times"] else 0.0
        if stats["_query_times"]:
            times = np.array(stats["_query_times"])
            stats["avg_time_ms"] = float(np.mean(times))
            stats["p90_time_ms"] = float(np.percentile(times, 90, method="linear"))
        else:
            stats["avg_time_ms"] = 0.0
            stats["p90_time_ms"] = 0.0

    def _where(self) -> Optional[str]:
        return f"({self.where})" if self.where else None

    def _column_expr(self, column_name: str) -> Expr:
        """Return expression for a column, applying configured SQL transform if present."""
        quoted_column_name = self.database.quote(s=column_name)
        if self.transform_columns and column_name in self.transform_columns:
            transform_expr = self.transform_columns[column_name]
            quoted_expr = transform_expr.format(column=quoted_column_name)
            return Code(quoted_expr)
        return this[column_name]

    def _with_raw_schema(self, raw_schema: Dict[str, RawColumnInfo]) -> Self:
        schema = self.database._process_table_schema(self.table_path, raw_schema, self.relevant_columns, self._where())
        # return self.new(schema=create_schema(self.database.name, self.table_path, schema, self.case_sensitive))
        return self.new(
            schema=create_schema(self.database.name, self.table_path, schema, self.case_sensitive),
            transform_columns=self.transform_columns,
        )

    def with_schema(self) -> Self:
        "Queries the table schema from the database, and returns a new instance of TableSegment, with a schema."
        if self._schema:
            return self

        start_time = time.monotonic()
        raw_schema = self.database.query_table_schema(self.table_path)
        query_time_ms = (time.monotonic() - start_time) * 1000
        self._update_stats("schema_queries_stats", query_time_ms)
        return self._with_raw_schema(raw_schema)

    def get_schema(self) -> Dict[str, RawColumnInfo]:
        return self.database.query_table_schema(self.table_path)

    def _make_key_range(self):
        if self.min_key is not None:
            for mn, k in safezip(self.min_key, self.key_columns):
                quoted = self.database.dialect.quote(k)
                base_expr_sql = (
                    self.transform_columns[k].format(column=quoted)
                    if self.transform_columns and k in self.transform_columns
                    else quoted
                )
                constant_val = self.database.dialect._constant_value(mn)
                yield Code(f"{base_expr_sql} >= {constant_val}")
        if self.max_key is not None:
            for k, mx in safezip(self.key_columns, self.max_key):
                quoted = self.database.dialect.quote(k)
                base_expr_sql = (
                    self.transform_columns[k].format(column=quoted)
                    if self.transform_columns and k in self.transform_columns
                    else quoted
                )
                constant_val = self.database.dialect._constant_value(mx)
                yield Code(f"{base_expr_sql} < {constant_val}")

    def _make_update_range(self):
        if self.min_update is not None:
            yield self.min_update <= this[self.update_column]
        if self.max_update is not None:
            yield this[self.update_column] < self.max_update

    @property
    def source_table(self):
        return table(*self.table_path, schema=self._schema)

    def make_select(self):
        return self.source_table.where(
            *self._make_key_range(), *self._make_update_range(), Code(self._where()) if self.where else SKIP
        )

    def get_values(self) -> list:
        "Download all the relevant values of the segment from the database"
        # Fetch all the original columns, even if some were later excluded from checking.

        # fetched_cols = [NormalizeAsString(this[c]) for c in self.relevant_columns]
        # select = self.make_select().select(*fetched_cols)
        if self._is_cancelled():
            raise JobCancelledError(self.job_id)
        select = self.make_select().select(*self._relevant_columns_repr)
        start_time = time.monotonic()
        result = self.database.query(select, List[Tuple])
        query_time_ms = (time.monotonic() - start_time) * 1000
        self._update_stats("row_fetch_queries_stats", query_time_ms)

        return result

    # def get_sample_data(self, limit: int = 100) -> list:
    #     "Download all the relevant values of the segment from the database"

    #     exprs = []
    #     for c in self.key_columns:
    #         quoted = self.database.dialect.quote(c)
    #         exprs.append(NormalizeAsString(Code(quoted), self._schema[c]))
    #     if self.where:
    #         select = self.source_table.select(*self._relevant_columns_repr).where(Code(self._where())).limit(limit)
    #         self.key_columns
    #     else:
    #         select = self.source_table.select(*self._relevant_columns_repr).limit(limit)

    #     start_time = time.monotonic()
    #     result = self.database.query(select, List[Tuple])
    #     query_time_ms = (time.monotonic() - start_time) * 1000
    #     self._update_stats("row_fetch_queries_stats", query_time_ms)

    def get_sample_data(self, limit: int = 100, sample_keys: Optional[List[List[DbKey]]] = None) -> list:
        """
        Download relevant values of the segment from the database.
        If `sample_keys` is provided, it filters rows matching those composite keys.

        Parameters:
            limit (int): Maximum number of rows to return (default: 100).
            sample_keys (Optional[List[List[DbKey]]]): List of composite keys to filter rows.
                Each inner list must match the number of key_columns.

        Returns:
            list: List of tuples containing the queried row data.
        """
        if self._is_cancelled():
            raise JobCancelledError(self.job_id)
        select = self.make_select().select(*self._relevant_columns_repr)

        filters = []

        if sample_keys:
            key_exprs = []
            for key_values in sample_keys:
                and_exprs = []
                for col, val in safezip(self.key_columns, key_values):
                    quoted = self.database.dialect.quote(col)
                    base_expr_sql = (
                        self.transform_columns[col].format(column=quoted)
                        if self.transform_columns and col in self.transform_columns
                        else quoted
                    )
                    schema = self._schema[col]
                    if val is None:
                        and_exprs.append(Code(base_expr_sql + " IS NULL"))
                        continue
                    mk_v = schema.make_value(val)
                    constant_val = self.database.dialect._constant_value(mk_v)

                    # Special handling for Sybase timestamp equality to handle precision mismatches
                    if hasattr(self.database.dialect, "timestamp_equality_condition") and hasattr(
                        mk_v, "_dt"
                    ):  # Check if it's a datetime-like object
                        where_expr = self.database.dialect.timestamp_equality_condition(base_expr_sql, constant_val)
                    else:
                        where_expr = f"{base_expr_sql} = {constant_val}"

                    and_exprs.append(Code(where_expr))
                if and_exprs:
                    key_exprs.append(and_(*and_exprs))
            if key_exprs:
                filters.append(or_(*key_exprs))
        if filters or self.where:
            select = select.where(*filters)
        else:
            logger.warning("No filters applied; fetching up to {} rows without key restrictions", limit)

        select = select.limit(limit)

        start_time = time.monotonic()
        result = self.database.query(select, List[Tuple])
        query_time_ms = (time.monotonic() - start_time) * 1000
        self._update_stats("row_fetch_queries_stats", query_time_ms)

        return result

    def choose_checkpoints(self, count: int) -> List[List[DbKey]]:
        "Suggests a bunch of evenly-spaced checkpoints to split by, including start, end."

        assert self.is_bounded

        # Take Nth root of count, to approximate the appropriate box size
        count = int(count ** (1 / len(self.key_columns))) or 1

        return split_compound_key_space(self.min_key, self.max_key, count)

    def segment_by_checkpoints(self, checkpoints: List[List[DbKey]]) -> List["TableSegment"]:
        "Split the current TableSegment to a bunch of smaller ones, separated by the given checkpoints"
        return [self.new_key_bounds(min_key=s, max_key=e) for s, e in create_mesh_from_points(*checkpoints)]

    def new(self, **kwargs) -> Self:
        """Creates a copy of the instance using 'replace()'"""
        return attrs.evolve(self, **kwargs)

    def new_key_bounds(self, min_key: Vector, max_key: Vector, *, key_types: Optional[Sequence[IKey]] = None) -> Self:
        if self.min_key is not None:
            assert self.min_key <= min_key, (self.min_key, min_key)
            assert self.min_key < max_key

        if self.max_key is not None:
            assert min_key < self.max_key
            assert max_key <= self.max_key

        # If asked, enforce the PKs to proper types, mainly to meta-params of the relevant side,
        # so that we do not leak e.g. casing of UUIDs from side A to side B and vice versa.
        # If not asked, keep the meta-params of the keys as is (assume them already casted).
        if key_types is not None:
            min_key = Vector(type.make_value(val) for type, val in safezip(key_types, min_key))
            max_key = Vector(type.make_value(val) for type, val in safezip(key_types, max_key))

        return attrs.evolve(self, min_key=min_key, max_key=max_key)

    @property
    def relevant_columns(self) -> List[str]:
        extras = list(self.extra_columns)

        if self.update_column and self.update_column not in extras:
            extras = [self.update_column] + extras

        return list(self.key_columns) + extras

    @property
    def _relevant_columns_repr(self) -> List[Expr]:
        # return [NormalizeAsString(this[c]) for c in self.relevant_columns]
        expressions = []
        for c in self.relevant_columns:
            schema = self._schema[c]
            expressions.append(NormalizeAsString(self._column_expr(c), schema))
        return expressions

    def count(self) -> int:
        """Count how many rows are in the segment, in one pass."""
        if self._is_cancelled():
            raise JobCancelledError(self.job_id)
        start_time = time.monotonic()
        result = self.database.query(self.make_select().select(Count()), int)
        query_time_ms = (time.monotonic() - start_time) * 1000
        self._update_stats("count_queries_stats", query_time_ms)

        return result

    def count_and_checksum(self) -> Tuple[int, int]:
        """Count and checksum the rows in the segment, in one pass."""
        if self._is_cancelled():
            raise JobCancelledError(self.job_id)
        checked_columns = [c for c in self.relevant_columns if c not in self.ignored_columns]
        # Build transformed expressions for checksum, honoring transforms and normalization
        checksum_exprs: List[Expr] = []
        for c in checked_columns:
            schema = self._schema[c]
            checksum_exprs.append(NormalizeAsString(self._column_expr(c), schema))

        q = self.make_select().select(Count(), Checksum(checksum_exprs))
        start_time = time.monotonic()
        count, checksum = self.database.query(q, tuple)
        query_time_ms = (time.monotonic() - start_time) * 1000

        self._update_stats("checksum_queries_stats", query_time_ms)

        duration = query_time_ms / 1000
        if duration > RECOMMENDED_CHECKSUM_DURATION:
            logger.warning(
                "Checksum is taking longer than expected (%.2f). "
                "We recommend increasing --bisection-factor or decreasing --threads.",
                duration,
            )

        if count:
            assert checksum, (count, checksum)
        return count or 0, int(checksum) if count else None

    def query_key_range(self) -> Tuple[tuple, tuple]:
        """Query database for minimum and maximum key. This is used for setting the initial bounds."""
        # Normalizes the result (needed for UUIDs) after the min/max computation
        select = self.make_select().select(
            ApplyFuncAndNormalizeAsString(self._column_expr(k), f) for k in self.key_columns for f in (min_, max_)
        )
        result = tuple(self.database.query(select, tuple))

        if any(i is None for i in result):
            raise ValueError("Table appears to be empty")

        # Min/max keys are interleaved
        min_key, max_key = result[::2], result[1::2]
        assert len(min_key) == len(max_key)

        return min_key, max_key

    @property
    def is_bounded(self):
        return self.min_key is not None and self.max_key is not None

    # def approximate_size(self, row_count: Optional[int] = None):
    #     if not self.is_bounded:
    #         raise RuntimeError("Cannot approximate the size of an unbounded segment. Must have min_key and max_key.")
    #     diff = self.max_key - self.min_key
    #     assert all(d > 0 for d in diff)
    #     return int_product(diff)

    def approximate_size(self, row_count: Optional[int] = None) -> int:
        if not self.is_bounded:
            raise RuntimeError("Cannot approximate the size of an unbounded segment. Must have min_key and max_key.")

        schema = self.get_schema()
        key_types = [schema[col].__class__ for col in self.key_columns]

        if all(issubclass(t, NumericType) for t in key_types):
            try:
                diff = [mx - mn for mn, mx in zip(self.min_key, self.max_key)]
                if not all(d > 0 for d in diff):
                    return row_count if row_count is not None else self.count()
                return int_product(diff)
            except (ValueError, TypeError):
                return row_count if row_count is not None else self.count()
        else:
            return row_count if row_count is not None else self.count()

    def _is_cancelled(self) -> bool:
        run_id = self.job_id
        if not run_id:
            return False
        run_id = f"revoke_job:{run_id}"
        try:
            backend = RedisBackend.get_instance()
            val = backend.client.get(run_id)
            if not val:
                return False
            if isinstance(val, bytes):
                try:
                    val = val.decode()
                except Exception:
                    val = str(val)
            return isinstance(val, str) and val.strip().lower() == "revoke"
        except Exception:
            logger.warning("Unable to query Redis for cancellation for run_id=%s", run_id)
            return False
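The key-splitting helpers at the top of this module (split_key_space, split_compound_key_space, create_mesh_from_points) are pure functions, so their behaviour can be checked without a live database. Below is a minimal usage sketch, not part of the package itself; it assumes the dcs-sdk wheel is installed so that data_diff.table_segment is importable, and mirrors the docstring example of create_mesh_from_points.

# Minimal sketch: tiling a 3-dimensional key space into boxes.
# Assumes the dcs-sdk wheel is installed; no database connection is needed,
# since create_mesh_from_points only operates on in-memory value lists.
from data_diff.table_segment import create_mesh_from_points, int_product

d1 = ("a", "b", "c")  # checkpoints along the first key dimension
d2 = (1, 2, 3)        # checkpoints along the second key dimension
d3 = ("X", "Y")       # checkpoints along the third key dimension

boxes = create_mesh_from_points(d1, d2, d3)

# Each box is a (start, end) pair of Vector objects; together the boxes tile
# the whole space bounded by the given checkpoint values.
for start, end in boxes:
    print(start, "->", end)

# Number of boxes = product of (len(axis) - 1) over all axes: 2 * 2 * 1 = 4
assert len(boxes) == int_product(len(v) - 1 for v in (d1, d2, d3))

TableSegment.choose_checkpoints and TableSegment.segment_by_checkpoints apply the same helpers to a bounded segment, producing one sub-segment per box via new_key_bounds.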
data_diff/thread_utils.py
@@ -0,0 +1,112 @@
# Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import itertools
from collections import deque
from collections.abc import Iterable
from concurrent.futures import ThreadPoolExecutor
from concurrent.futures.thread import _WorkItem
from queue import PriorityQueue
from time import sleep
from typing import Any, Callable, Iterator, Optional

import attrs


class AutoPriorityQueue(PriorityQueue):
    """Overrides PriorityQueue to automatically get the priority from _WorkItem.kwargs

    We also assign a unique id for each item, to avoid making comparisons on _WorkItem.
    As a side effect, items with the same priority are returned FIFO.
    """

    _counter = itertools.count().__next__

    def put(self, item: Optional[_WorkItem], block=True, timeout=None) -> None:
        priority = item.kwargs.pop("priority") if item is not None else 0
        super().put((-priority, self._counter(), item), block, timeout)

    def get(self, block=True, timeout=None) -> Optional[_WorkItem]:
        _p, _c, work_item = super().get(block, timeout)
        return work_item


class PriorityThreadPoolExecutor(ThreadPoolExecutor):
    """Overrides ThreadPoolExecutor to use AutoPriorityQueue

    XXX WARNING: Might break in future versions of Python
    """

    def __init__(self, *args) -> None:
        super().__init__(*args)
        self._work_queue = AutoPriorityQueue()


@attrs.define(frozen=False, init=False)
class ThreadedYielder(Iterable):
    """Yields results from multiple threads into a single iterator, ordered by priority.

    To add a source iterator, call ``submit()`` with a function that returns an iterator.
    Priority for the iterator can be provided via the keyword argument 'priority'. (higher runs first)
    """

    _pool: ThreadPoolExecutor
    _futures: deque
    _yield: deque
    _exception: Optional[None]

    _pool: ThreadPoolExecutor
    _futures: deque
    _yield: deque = attrs.field(alias="_yield")  # Python keyword!
    _exception: Optional[None]
    yield_list: bool

    def __init__(self, max_workers: Optional[int] = None, yield_list: bool = False) -> None:
        super().__init__()
        self._pool = PriorityThreadPoolExecutor(max_workers)
        self._futures = deque()
        self._yield = deque()
        self._exception = None
        self.yield_list = yield_list

    def _worker(self, fn, *args, **kwargs) -> None:
        try:
            res = fn(*args, **kwargs)
            if res is not None:
                if self.yield_list:
                    self._yield.append(res)
                else:
                    self._yield += res
        except Exception as e:
            self._exception = e

    def submit(self, fn: Callable, *args, priority: int = 0, **kwargs) -> None:
        self._futures.append(self._pool.submit(self._worker, fn, *args, priority=priority, **kwargs))

    def __iter__(self) -> Iterator[Any]:
        while True:
            if self._exception:
                raise self._exception

            while self._yield:
                yield self._yield.popleft()

            if not self._futures:
                # No more tasks
                return

            if self._futures[0].done():
                self._futures.popleft()
            else:
                sleep(0.001)
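ThreadedYielder merges results produced by callables running on a priority thread pool into a single iterator, with higher-priority submissions scheduled first. The following is a minimal usage sketch, assuming the dcs-sdk wheel is installed so that data_diff.thread_utils is importable; fetch_chunk is a hypothetical stand-in for real per-segment work.

# Minimal sketch: draining results from several prioritized worker tasks.
# Assumes the dcs-sdk wheel is installed; fetch_chunk is a placeholder callable.
from data_diff.thread_utils import ThreadedYielder

def fetch_chunk(offset: int):
    # Stand-in for real work; each task returns an iterable of results.
    return [offset + i for i in range(3)]

yielder = ThreadedYielder(max_workers=2)
yielder.submit(fetch_chunk, 0, priority=1)
yielder.submit(fetch_chunk, 100, priority=5)  # higher priority is scheduled first

# Iteration ends once all submitted tasks are done and their results drained;
# an exception raised inside a worker is re-raised here.
for value in yielder:
    print(value)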