dcs-sdk 1.6.5__py3-none-any.whl → 1.6.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data_diff/__init__.py CHANGED
@@ -55,9 +55,7 @@ def connect_to_table(
         db_info.pop(k)
     if isinstance(key_columns, str):
         key_columns = (key_columns,)
-
     db: Database = connect(db_info, thread_count=thread_count)
-
     if isinstance(table_name, str):
         table_name = db.dialect.parse_table_name(table_name)
 
dcs_core/core/common/errors.py CHANGED
@@ -16,6 +16,8 @@ ERROR_RUNTIME = "runtime_error"
 ERROR_CONFIGURATION = "configuration_error"
 ERROR_DATA_SOURCES_CONNECTION = "data_sources_connection_error"
 ERROR_METRIC_GENERATION = "metric_generation_error"
+ERROR_FETCHING_TABLE = "table_fetch_error"
+ERROR_FETCHING_COLUMN = "column_fetch_error"
 
 
 class DataChecksRuntimeError(Exception):
@@ -48,3 +50,19 @@ class DataChecksMetricGenerationError(Exception):
     def __init__(self, message):
         super().__init__(message)
         self.error_code = ERROR_METRIC_GENERATION
+
+
+class DatachecksTableFetchError(Exception):
+    """Raised when there is an error in fetching table."""
+
+    def __init__(self, message):
+        super().__init__(message)
+        self.error_code = ERROR_FETCHING_TABLE
+
+
+class DatachecksColumnFetchError(Exception):
+    """Raised when there is an error in fetching column."""
+
+    def __init__(self, message):
+        super().__init__(message)
+        self.error_code = ERROR_FETCHING_COLUMN
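The two new exception types follow the pattern of the existing DataChecks errors and carry machine-readable error codes. A minimal sketch of how a caller might handle them (the data_source object and the fallback value are hypothetical):

from dcs_core.core.common.errors import (
    DatachecksColumnFetchError,
    DatachecksTableFetchError,
)

def list_tables_safely(data_source):
    # data_source is a hypothetical, already-connected DCS data source
    try:
        return data_source.query_get_table_names()
    except DatachecksTableFetchError as e:
        # e.error_code == "table_fetch_error" per the new constants
        print(f"[{e.error_code}] {e}")
        return {"table": []}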
dcs_core/core/common/models/configuration.py CHANGED
@@ -43,6 +43,7 @@ class DataSourceType(str, Enum):
     ORACLE = "oracle"
     DB2 = "db2"
     SYBASE = "sybase"
+    AZURE_BLOB = "azure_blob"
 
 
 class DataSourceLanguageSupport(str, Enum):
@@ -85,6 +86,11 @@ class DataSourceConnectionConfiguration:
     security: Optional[str] = None  # IBM DB2 specific configuration
     protocol: Optional[str] = None  # IBM DB2 specific configuration
     server: Optional[str] = None
+    account_name: Optional[str] = None
+    container_name: Optional[str] = None
+    account_key: Optional[str] = None
+    endpoint_suffix: Optional[str] = None
+    subfolder_path: Optional[str] = None
 
 
 @dataclass
dcs_core/core/datasource/file_datasource.py ADDED
@@ -0,0 +1,26 @@
+# Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Dict
+
+from dcs_core.core.datasource.base import DataSource
+
+
+class FileDataSource(DataSource):
+    """
+    Abstract class for File data sources
+    """
+
+    def __init__(self, data_source_name: str, data_connection: Dict):
+        super().__init__(data_source_name, data_connection)
dcs_core/core/datasource/manager.py CHANGED
@@ -11,6 +11,20 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+# Copyright 2022-present, the Waterdip Labs Pvt. Ltd.manager
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import importlib
 from dataclasses import asdict
 from typing import Dict, List
@@ -43,6 +57,7 @@ class DataSourceManager:
         "oracle": "OracleDataSource",
         "db2": "DB2DataSource",
         "sybase": "SybaseDataSource",
+        "azure_blob": "AzureBlobDataSource",
     }
 
     def __init__(self, config: Configuration):
dcs_core/integrations/databases/azure_blob.py ADDED
@@ -0,0 +1,115 @@
+# Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import io
+from typing import Any, Dict, List, Optional
+
+import pandas as pd
+from azure.storage.blob import BlobServiceClient
+
+from dcs_core.core.common.errors import (
+    DatachecksColumnFetchError,
+    DataChecksDataSourcesConnectionError,
+    DatachecksTableFetchError,
+)
+from dcs_core.core.datasource.file_datasource import FileDataSource
+
+
+class AzureBlobDataSource(FileDataSource):
+    def __init__(self, data_source_name: str, data_connection: Dict):
+        super().__init__(data_source_name, data_connection)
+        self.allowed_file_extensions = [".csv"]
+        self.blob_service_client: Optional[BlobServiceClient] = None
+        self.connection = None
+
+    def connect(self) -> Any:
+        """
+        Connect to the file data source
+        """
+        try:
+            account_name = self.data_connection.get("account_name")
+            container_name = self.data_connection.get("container_name")
+            account_key = self.data_connection.get("account_key")
+            endpoint_suffix = self.data_connection.get("endpoint_suffix", "core.windows.net")
+            connection_str = f"https://{account_name}.blob.{endpoint_suffix}"
+            blob_service_client = BlobServiceClient(account_url=connection_str, credential=account_key)
+            self.blob_service_client = blob_service_client
+            self.connection = blob_service_client.get_container_client(container=container_name)
+            return self.connection
+        except Exception as e:
+            raise DataChecksDataSourcesConnectionError(f"Failed to connect to Azure Blob Storage: {e}")
+
+    def is_connected(self) -> bool:
+        """
+        Check if the file data source is connected
+        """
+        return self.connection is not None
+
+    def close(self):
+        """
+        Close the connection
+        """
+        self.connection.close()
+        self.blob_service_client.close()
+        self.connection = None
+        self.blob_service_client = None
+
+    def query_get_table_names(self) -> dict:
+        """
+        Query to get table names (blob names in this case)
+        """
+        if not self.is_connected():
+            raise DataChecksDataSourcesConnectionError("Not connected to Azure Blob Storage")
+        try:
+            subfolder = self.data_connection.get("subfolder", "")
+            blob_iterator = self.connection.list_blobs(name_starts_with=subfolder)
+            blobs = [
+                blob.name
+                for blob in blob_iterator
+                if len(blob.name.split("/")) == 1 and blob.name.endswith(tuple(self.allowed_file_extensions))
+            ]
+            return {"table": blobs}
+        except Exception as e:
+            raise DatachecksTableFetchError(f"Failed to list blobs: {e}")
+
+    def query_get_table_columns(self, table: str) -> List[dict]:
+        """
+        Get column names for a table (CSV blob in this case).
+        """
+        if not self.is_connected():
+            raise DataChecksDataSourcesConnectionError("Not connected to Azure Blob Storage")
+
+        if not any(table.endswith(ext) for ext in self.allowed_file_extensions):
+            raise ValueError(f"Unsupported file type for {table}. Allowed: {self.allowed_file_extensions}")
+
+        try:
+            blob_client = self.connection.get_blob_client(blob=table)
+            download_stream = blob_client.download_blob()
+            data = download_stream.readall()
+            if table.endswith(".csv"):
+                df = pd.read_csv(io.BytesIO(data))
+            else:
+                raise ValueError(f"Unsupported file type for {table}. Allowed: {self.allowed_file_extensions}")
+
+            return [{"column_name": col, "column_type": "string"} for col in df.columns.tolist()]
+        except Exception as e:
+            raise DatachecksColumnFetchError(f"Failed to read columns from blob '{table}': {e}")
+
+    def query_get_database_version(self) -> str:
+        """
+        Get the database version
+        :return: version string
+        """
+        api_version = self.blob_service_client.api_version
+        return api_version
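A minimal usage sketch for the new AzureBlobDataSource, assuming the data_connection dict carries the keys the class reads (account_name, container_name, account_key, endpoint_suffix); the credential values and the blob name below are placeholders:

from dcs_core.integrations.databases.azure_blob import AzureBlobDataSource

connection = {
    "account_name": "myaccount",             # placeholder
    "container_name": "mycontainer",         # placeholder
    "account_key": "<storage-account-key>",  # placeholder
    "endpoint_suffix": "core.windows.net",
}

ds = AzureBlobDataSource("azure_blob_demo", connection)
ds.connect()                                     # raises DataChecksDataSourcesConnectionError on failure
print(ds.query_get_table_names())                # {"table": [...]} listing top-level .csv blobs
print(ds.query_get_table_columns("orders.csv"))  # "orders.csv" is a hypothetical blob name
ds.close()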
dcs_core/integrations/databases/mssql.py CHANGED
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import datetime
+import math
 from decimal import Decimal
 from typing import Any, Dict, List, Optional, Tuple, Union
 from uuid import UUID
@@ -706,13 +707,15 @@ class MssqlDataSource(SQLDataSource):
         cursor = self.connection.cursor()
         try:
             cursor.execute(query)
-            columns = [column[0] for column in cursor.description]
-            result_row = cursor.fetchone()
+            if cursor.description:
+                columns = [column[0] for column in cursor.description]
+                result_row = cursor.fetchone()
+                row = dict(zip(columns, result_row)) if result_row else {}
+            else:
+                row = {}
         finally:
             cursor.close()
 
-        row = dict(zip(columns, result_row))
-
         def _normalize_metrics(value):
             """Safely normalize DB metric values for JSON serialization."""
             if value is None:
@@ -737,11 +740,158 @@ class MssqlDataSource(SQLDataSource):
             col_metrics = {}
 
             for key, value in row.items():
-                if key.startswith(f"{name}_"):
-                    metric_name = key[len(name) + 1 :]
+                clean_key = key.replace("[", "").replace("]", "")
+                if clean_key.startswith(f"{name}_"):
+                    metric_name = clean_key[len(name) + 1 :]
                     col_metrics[metric_name] = _normalize_metrics(value)
 
             column_wise.append({"column_name": name, "metrics": col_metrics})
+
+        for col_data in column_wise:
+            metrics = col_data["metrics"]
+            distinct_count = metrics.get("distinct")
+            col_name = col_data["column_name"]
+
+            dtype = next(c["data_type"].lower() for c in column_info if c["column_name"] == col_name)
+
+            quoted = self.quote_column(col_name)
+
+            is_dtype_numeric = (
+                True
+                if dtype
+                in (
+                    "int",
+                    "integer",
+                    "bigint",
+                    "smallint",
+                    "tinyint",
+                    "decimal",
+                    "numeric",
+                    "float",
+                    "real",
+                    "money",
+                    "smallmoney",
+                )
+                else False
+            )
+
+            if is_dtype_numeric:
+                col_min = metrics.get("min")
+                col_max = metrics.get("max")
+
+                if col_min is not None and col_max is not None and col_min != col_max:
+                    bucket_count = 20
+                    bucket_size = (float(col_max) - float(col_min)) / bucket_count
+
+                    bucket_queries = []
+                    for i in range(bucket_count):
+                        start = float(col_min) + i * bucket_size
+                        end = float(col_min) + (i + 1) * bucket_size
+
+                        bucket_queries.append(
+                            f"SUM(CASE WHEN {quoted} >= {start} AND {quoted} < {end} THEN 1 ELSE 0 END) AS bucket_{i}"
+                        )
+
+                    bucket_sql = f"SELECT {', '.join(bucket_queries)} FROM {qualified_table}"
+
+                    try:
+                        bucket_result = self.connection.execute(text(bucket_sql)).fetchone()
+                        distribution = []
+
+                        for i in range(bucket_count):
+                            start_raw = float(col_min) + i * bucket_size
+                            end_raw = float(col_min) + (i + 1) * bucket_size
+
+                            if dtype in ("int", "integer", "bigint", "smallint", "tinyint"):
+                                start = math.floor(start_raw)
+                                end = math.ceil(end_raw)
+                            else:
+                                start = round(start_raw, 2)
+                                end = round(end_raw, 2)
+
+                            count = bucket_result[i] if bucket_result and bucket_result[i] is not None else 0
+
+                            distribution.append(
+                                {
+                                    "col_val": f"{start} - {end}",
+                                    "count": count,
+                                }
+                            )
+
+                        metrics["distribution_graph"] = distribution
+
+                    except Exception as e:
+                        print(f"Failed to generate numeric distribution for {col_name}: {e}")
+
+                continue
+
+            if isinstance(distinct_count, (int, float)) and distinct_count <= 20:
+                if dtype in ("text", "ntext", "xml"):
+                    group_expr = f"CAST({quoted} AS NVARCHAR(MAX))"
+                else:
+                    group_expr = quoted
+
+                dist_query = (
+                    f"SELECT {group_expr}, COUNT(*) "
+                    f"FROM {qualified_table} GROUP BY {group_expr} ORDER BY COUNT(*) DESC"
+                )
+
+                try:
+                    dist_cursor = self.connection.cursor()
+                    dist_cursor.execute(dist_query)
+                    dist_result = dist_cursor.fetchall()
+                    dist_cursor.close()
+
+                    distribution = []
+
+                    for r in dist_result:
+                        val = _normalize_metrics(r[0])
+                        distribution.append(
+                            {
+                                "col_val": val,
+                                "count": r[1],
+                            }
+                        )
+
+                    metrics["distribution_graph"] = distribution
+
+                except Exception as e:
+                    print(f"Failed to generate distribution graph for column {col_name}: {e}")
+
+        for col_data in column_wise:
+            metrics = col_data["metrics"]
+            distinct_count = metrics.get("distinct")
+            col_name = col_data["column_name"]
+            dtype = next(c["data_type"].lower() for c in column_info if c["column_name"] == col_name)
+
+            quoted = self.quote_column(col_name)
+
+            is_dtype_numeric = (
+                True
+                if dtype
+                in (
+                    "int",
+                    "integer",
+                    "bigint",
+                    "smallint",
+                    "tinyint",
+                    "decimal",
+                    "numeric",
+                    "float",
+                    "real",
+                    "money",
+                    "smallmoney",
+                )
+                else False
+            )
+
+            formatted_metrics_data = {
+                "general_data": {key: value for key, value in metrics.items() if key != "distribution_graph"},
+                "is_dtype_numeric": is_dtype_numeric,
+                "distribution_data": metrics.get("distribution_graph", []),
+            }
+            col_data["metrics"] = formatted_metrics_data
+
         return column_wise
 
     def fetch_sample_values_from_database(
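Both the MSSQL profiler above and the Postgres profiler below build the numeric distribution the same way: 20 equal-width buckets between the column's min and max, with integer columns getting floor/ceil bucket labels. A small standalone illustration of that arithmetic, using made-up min/max values:

import math

col_min, col_max, bucket_count = 0.0, 250.0, 20   # made-up example values
bucket_size = (col_max - col_min) / bucket_count  # 12.5

labels = []
for i in range(bucket_count):
    start_raw = col_min + i * bucket_size
    end_raw = col_min + (i + 1) * bucket_size
    # integer dtypes use floor/ceil bounds; other numerics are rounded to 2 decimals
    labels.append(f"{math.floor(start_raw)} - {math.ceil(end_raw)}")

print(labels[0], labels[-1])  # "0 - 13" ... "237 - 250"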
dcs_core/integrations/databases/postgres.py CHANGED
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import datetime
+import math
 from decimal import Decimal
 from typing import Any, Dict, List, Optional, Tuple
 from uuid import UUID
@@ -411,9 +412,73 @@ class PostgresDataSource(SQLDataSource):
             col_name = col_data["column_name"]
             dtype = next(c["data_type"].lower() for c in column_info if c["column_name"] == col_name)
 
-            if isinstance(distinct_count, (int, float)) and distinct_count < 20:
-                quoted = self.quote_column(col_name)
+            quoted = self.quote_column(col_name)
+
+            is_dtype_numeric = (
+                True
+                if dtype
+                in (
+                    "int",
+                    "integer",
+                    "bigint",
+                    "smallint",
+                    "decimal",
+                    "numeric",
+                    "float",
+                    "double",
+                )
+                else False
+            )
+
+            if is_dtype_numeric:
+                col_min = metrics.get("min")
+                col_max = metrics.get("max")
+
+                if col_min is not None and col_max is not None and col_min != col_max:
+                    bucket_count = 20
+                    bucket_size = (col_max - col_min) / bucket_count
+
+                    bucket_queries = []
+                    for i in range(bucket_count):
+                        start = col_min + i * bucket_size
+                        end = col_min + (i + 1) * bucket_size
+
+                        bucket_queries.append(
+                            f"SUM(CASE WHEN {quoted} >= {start} AND {quoted} < {end} THEN 1 ELSE 0 END) AS bucket_{i}"
+                        )
+
+                    bucket_sql = f"SELECT {', '.join(bucket_queries)} FROM {qualified_table}"
+
+                    try:
+                        bucket_result = self.connection.execute(text(bucket_sql)).fetchone()
+                        distribution = []
+
+                        for i in range(bucket_count):
+                            start_raw = col_min + i * bucket_size
+                            end_raw = col_min + (i + 1) * bucket_size
+                            if dtype in ("int", "integer", "bigint", "smallint"):
+                                start = math.floor(start_raw)
+                                end = math.ceil(end_raw)
+                            else:
+                                start = round(start_raw, 2)
+                                end = round(end_raw, 2)
+                            count = bucket_result[i]
+
+                            distribution.append(
+                                {
+                                    "col_val": f"{start} - {end}",
+                                    "count": count,
+                                }
+                            )
 
+                        metrics["distribution_graph"] = distribution
+
+                    except Exception as e:
+                        print(f"Failed to generate numeric distribution for {col_name}: {e}")
+
+                continue
+
+            if isinstance(distinct_count, (int, float)) and distinct_count <= 20:
                 if dtype in ("json", "jsonb"):
                     group_expr = f"{quoted}::text"
                 else:
@@ -444,8 +509,31 @@ class PostgresDataSource(SQLDataSource):
 
         for col_data in column_wise:
             metrics = col_data["metrics"]
+            distinct_count = metrics.get("distinct")
+            col_name = col_data["column_name"]
+            dtype = next(c["data_type"].lower() for c in column_info if c["column_name"] == col_name)
+
+            quoted = self.quote_column(col_name)
+
+            is_dtype_numeric = (
+                True
+                if dtype
+                in (
+                    "int",
+                    "integer",
+                    "bigint",
+                    "smallint",
+                    "decimal",
+                    "numeric",
+                    "float",
+                    "double",
+                )
+                else False
+            )
+
             formatted_metrics_data = {
                 "general_data": {key: value for key, value in metrics.items() if key != "distribution_graph"},
+                "is_dtype_numeric": is_dtype_numeric,
                 "distribution_data": metrics.get("distribution_graph", []),
             }
             col_data["metrics"] = formatted_metrics_data
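After this reshaping, each entry in column_wise exposes the plain metrics, a numeric-dtype flag, and the distribution separately. A sketch of the resulting shape with invented column name and values:

example_column_entry = {
    "column_name": "amount",  # hypothetical column
    "metrics": {
        "general_data": {"min": 1, "max": 250, "distinct": 180},
        "is_dtype_numeric": True,
        "distribution_data": [
            {"col_val": "1 - 14", "count": 42},
            # ... one entry per bucket (numeric) or distinct value (categorical)
        ],
    },
}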
dcs_sdk/__version__.py CHANGED
@@ -12,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = "1.6.4"
+__version__ = "1.6.7"
dcs_sdk/sdk/config/config_loader.py CHANGED
@@ -47,6 +47,7 @@ class SourceTargetConnection(BaseModel):
     port: Optional[Union[int, str]] = None
     driver: str
     table: Optional[str] = None
+    datasource_type: Optional[str] = None
     database: Optional[str] = None
     filepath: Optional[str] = None
     catalog: Optional[str] = None
@@ -66,6 +67,11 @@ class SourceTargetConnection(BaseModel):
     impersonate_service_account: Optional[str] = None  # bigquery specific
     bigquery_credentials: Optional[str] = None  # bigquery specific
     transform_columns: Dict[str, str] | None = None
+    account_name: Optional[str] = None
+    container_name: Optional[str] = None
+    account_key: Optional[str] = None
+    endpoint_suffix: Optional[str] = None
+    subfolder_path: Optional[str] = None
 
 
 class SimilarityConfig(BaseModel):
@@ -140,6 +146,7 @@ class DataDiffConfig:
         "mysql": "mysql",
         "sybase": "sybase",
         "bigquery": "bigquery",
+        "azure_blob": "duckdb",
     }
 
     def __init__(
@@ -307,6 +314,12 @@ class DataDiffConfig:
             "impersonate_service_account": connection.get("connection", {}).get("impersonate_service_account"),
             "bigquery_credentials": connection.get("connection", {}).get("bigquery_credentials"),
             "transform_columns": transform_columns,
+            "datasource_type": connection.get("type"),
+            "account_name": connection.get("connection", {}).get("account_name"),
+            "container_name": connection.get("connection", {}).get("container_name"),
+            "account_key": connection.get("connection", {}).get("account_key"),
+            "endpoint_suffix": connection.get("connection", {}).get("endpoint_suffix"),
+            "subfolder_path": connection.get("connection", {}).get("subfolder_path"),
         }
 
     def get_data_diff_configs(self) -> List[Comparison]:
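The new keys are read from the comparison configuration's connection block. A hedged sketch of an azure_blob entry as this mapping expects it (the overall layout is inferred from the .get() calls above; all values are placeholders):

connection = {
    "type": "azure_blob",
    "connection": {
        "account_name": "myaccount",             # placeholder
        "container_name": "mycontainer",         # placeholder
        "account_key": "<storage-account-key>",  # placeholder
        "endpoint_suffix": "core.windows.net",
        "subfolder_path": "exports/2024",        # optional, placeholder
    },
}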
dcs_sdk/sdk/data_diff/data_differ.py CHANGED
@@ -18,6 +18,7 @@ import time
 from collections import defaultdict
 from contextlib import suppress
 from datetime import datetime, timezone
+from pathlib import Path
 from typing import Dict, Optional
 
 from loguru import logger
@@ -32,9 +33,11 @@ from dcs_sdk.sdk.utils.serializer import serialize_table_schema
 from dcs_sdk.sdk.utils.table import create_table_schema_row_count, differ_rows
 from dcs_sdk.sdk.utils.themes import theme_1
 from dcs_sdk.sdk.utils.utils import (
+    azure_to_csv_file,
     calculate_column_differences,
     convert_to_masked_if_required,
     duck_db_load_csv_to_table,
+    duck_db_load_pd_to_table,
     find_identical_columns,
     generate_table_name,
     obfuscate_sensitive_data,
@@ -67,6 +70,7 @@ class DBTableDiffer:
         self.target_db: Database = None
         self.similarity = self.config.similarity
         self.similarity_providers = None
+        self.allowed_file_comparison_types = ["azure_blob"]
         if self.similarity:
             from dcs_sdk.sdk.utils.similarity_score.base_provider import (
                 ensure_nltk_data,
@@ -88,6 +92,8 @@ class DBTableDiffer:
                 "levenshtein": LevenshteinDistanceProvider,
                 "cosine": CosineSimilarityProvider,
             }
+        self.original_source_table_name = self.config.source.table
+        self.original_target_table_name = self.config.target.table
 
     def create_dataset_dict(
         self,
@@ -96,6 +102,7 @@ class DBTableDiffer:
         db_name: str,
        file_path: str,
         database_type: str,
+        is_file_ds: bool = False,
     ) -> Dict:
         schema_list = [serialize_table_schema(v) for v in table.get_schema().values()]
         schema_list.sort(key=lambda x: x["column_name"].upper())
@@ -106,8 +113,8 @@ class DBTableDiffer:
             "workspace": config.workspace,
             "database_type": database_type,
             "table_name": table.table_path[0],
-            "schema": table.database.default_schema,
-            "database": db_name,
+            "schema": table.database.default_schema if not is_file_ds else None,
+            "database": db_name if not is_file_ds else None,
             "primary_keys": list(table.key_columns),
             "file_path": file_path,
             "files": [] if file_path is None else [generate_table_name(csv, False) for csv in glob.glob(file_path)],
@@ -217,15 +224,50 @@ class DBTableDiffer:
         )
 
     def process_duckdb(self, is_source: bool):
-        if is_source:
-            filepath = self.config.source.filepath
-        else:
-            filepath = self.config.target.filepath
-        if filepath is None:
-            raise ValueError("File path is required for file")
-        if filepath.endswith(".csv"):
-            if not duck_db_load_csv_to_table(self.config, filepath, is_source):
-                raise ValueError(f"Error in loading CSV, for the {'source' if is_source else 'target'}")
+        try:
+            ds_type = self.config.source.datasource_type if is_source else self.config.target.datasource_type
+            if ds_type in self.allowed_file_comparison_types:
+                try:
+                    if ds_type == "azure_blob":
+                        df = azure_to_csv_file(self.config, is_source)
+                        name_only = (
+                            Path(self.config.source.table).stem if is_source else Path(self.config.target.table).stem
+                        )
+
+                        if is_source:
+                            self.config.source.table = name_only
+                        else:
+                            self.config.target.table = name_only
+
+                        if not duck_db_load_pd_to_table(config=self.config, is_source=is_source, df=df):
+                            raise ValueError(
+                                f"Error loading CSV into DuckDB for the {'source' if is_source else 'target'} table."
+                            )
+                except Exception as e:
+                    raise RuntimeError(
+                        f"Failed processing Azure Blob for {'source' if is_source else 'target'}: {e}"
+                    ) from e
+
+            else:
+                try:
+                    filepath = self.config.source.filepath if is_source else self.config.target.filepath
+                    if filepath is None:
+                        raise ValueError("File path is required for file-based source.")
+
+                    if filepath.endswith(".csv"):
+                        if not duck_db_load_csv_to_table(self.config, filepath, is_source):
+                            raise ValueError(
+                                f"Error loading CSV into DuckDB for the {'source' if is_source else 'target'} table."
+                            )
+                    else:
+                        raise ValueError(f"Unsupported file format: {filepath}")
+                except Exception as e:
+                    raise RuntimeError(
+                        f"Failed processing local file for {'source' if is_source else 'target'}: {e}"
+                    ) from e
+
+        except Exception as e:
+            raise RuntimeError(f"process_duckdb failed for {'source' if is_source else 'target'}: {e}") from e
 
     def _prepare_source_table(self) -> Optional[str]:
         view_name = None
@@ -346,6 +388,7 @@ class DBTableDiffer:
             db1_name,
             self.source_file_path,
             "file" if self.config.source.driver == "duckdb" else self.config.source.driver,
+            True if self.config.source.driver == "duckdb" else False,
         )
         target_dataset = self.create_dataset_dict(
             self.config.target,
@@ -353,6 +396,7 @@ class DBTableDiffer:
             db2_name,
             self.target_file_path,
             "file" if self.config.target.driver == "duckdb" else self.config.target.driver,
+            True if self.config.target.driver == "duckdb" else False,
         )
         table_1_row_count = source_dataset.get("row_count", 0)
         table_2_row_count = target_dataset.get("row_count", 0)
@@ -690,7 +734,10 @@ class DBTableDiffer:
 
             self.response.update({"column_transforms": column_transforms})
             self.response.update({"schema_overrides": schema_overrides})
-
+            self.config.source.table = self.original_source_table_name
+            self.config.target.table = self.original_target_table_name
+            self.response["source_dataset"]["table_name"] = self.original_source_table_name
+            self.response["target_dataset"]["table_name"] = self.original_target_table_name
             return self.response
         except Exception as e:
             logger.exception(f"Error during diff_tables: {e}")
dcs_sdk/sdk/utils/utils.py CHANGED
@@ -13,12 +13,19 @@
 # limitations under the License.
 
 import glob
+import io
 import os
 import uuid
+from concurrent.futures import ThreadPoolExecutor
+from pathlib import Path
+from queue import Empty, Queue
 from typing import List, Optional, Union
 
 import duckdb
+import pandas as pd
 import requests
+from azure.storage.blob import BlobServiceClient
+from loguru import logger
 
 from dcs_sdk.sdk.config.config_loader import Comparison
 from dcs_sdk.sdk.rules.rules_repository import RulesRepository
@@ -137,6 +144,134 @@ def calculate_column_differences(source_columns, target_columns, columns_mapping
     )
 
 
+def chunk_load_to_pandas(queue: Queue, result_df: list, timeout: float = 2.0):
+    """Consumer thread: read CSV chunks from queue & build final DataFrame"""
+    df = pd.DataFrame()
+    try:
+        while True:
+            try:
+                data = queue.get(timeout=timeout)
+            except Empty:
+                continue
+
+            if data is None:
+                break
+
+            try:
+                df = pd.concat([df, pd.read_csv(io.BytesIO(data))], ignore_index=True)
+            except Exception as e:
+                logger.error(f"[ERROR] Failed to read CSV chunk: {e}")
+                continue
+
+    except Exception as e:
+        logger.error(f"[FATAL] Consumer crashed: {e}")
+
+    finally:
+        result_df.append(df)
+
+
+def azure_to_csv_file(config: Comparison, is_source: bool = False) -> tuple[str, str]:
+    """Download CSV from Azure and save to local file"""
+    CHUNK_SIZE = 4 * 1024 * 1024
+    account_name = config.source.account_name if is_source else config.target.account_name
+    container_name = config.source.container_name if is_source else config.target.container_name
+    account_key = config.source.account_key if is_source else config.target.account_key
+    endpoint_suffix = config.source.endpoint_suffix if is_source else config.target.endpoint_suffix
+
+    table = config.source.table if is_source else config.target.table
+
+    connection_str = f"https://{account_name}.blob.{endpoint_suffix}"
+    blob_client = BlobServiceClient(account_url=connection_str, credential=account_key).get_blob_client(
+        container=container_name, blob=table
+    )
+    blob_size = blob_client.get_blob_properties().size
+    start = 0
+    queue = Queue()
+    result_df = []
+    with ThreadPoolExecutor(max_workers=1) as executor:
+        executor.submit(chunk_load_to_pandas, queue, result_df)
+
+        all_data = b""
+        while start < blob_size:
+            end = min(start + CHUNK_SIZE - 1, blob_size - 1)
+            data = blob_client.download_blob(offset=start, length=end - start + 1).readall()
+            all_data += data
+            queue.put(data)
+            start += CHUNK_SIZE
+
+        queue.put(None)
+    if not result_df or len(result_df) == 0:
+        raise ValueError("No data downloaded from Azure Blob Storage")
+    return result_df[0]
+
+
+def duck_db_load_pd_to_table(config: Comparison, is_source: bool = False, df: pd.DataFrame = None) -> bool:
+    if df is None:
+        logger.error("DataFrame is None, cannot load to DuckDB")
+        return False
+    dir_name = "tmp"
+    if not os.path.exists(dir_name):
+        os.makedirs(dir_name)
+
+    if is_source:
+        pk_cols = config.primary_keys_source
+    else:
+        pk_cols = config.primary_keys_target
+
+    duck_db_file_name = f"{dir_name}/{uuid.uuid4()}.duckdb"
+    create_view = False
+    query = None
+    if is_source and config.source_query:
+        create_view = True
+        query = config.source_query
+    elif not is_source and config.target_query:
+        create_view = True
+        query = config.target_query
+
+    try:
+        table_name = config.source.table if is_source else config.target.table
+
+        conn = duckdb.connect(database=duck_db_file_name, read_only=False)
+
+        conn.register("df_view", df)
+
+        conn.execute(
+            f"""
+            CREATE OR REPLACE TABLE {table_name} AS
+            SELECT * FROM df_view;
+            """
+        )
+
+        if pk_cols and len(pk_cols) > 0:
+            pk_cols_str = ", ".join(pk_cols)
+            conn.execute(
+                f"""
+                CREATE INDEX idx_{table_name} ON {table_name} ({pk_cols_str});
+                """
+            )
+
+        if create_view:
+            view_name = f"{table_name}_query"
+            conn.execute(
+                f"""
+                CREATE VIEW {view_name} AS {query};
+                """
+            )
+
+        conn.unregister("df_view")
+        conn.close()
+
+    except Exception as e:
+        logger.error(f"Error in loading CSV to DuckDB: {e}")
+        return False
+
+    if is_source:
+        config.source.filepath = duck_db_file_name
+    else:
+        config.target.filepath = duck_db_file_name
+    return True
+
+
 def duck_db_load_csv_to_table(config: Comparison, path, is_source: bool = False) -> bool:
     dir_name = "tmp"
     if not os.path.exists(dir_name):
@@ -194,7 +329,7 @@ def duck_db_load_csv_to_table(config: Comparison, path, is_source: bool = False)
         )
         conn.close()
     except Exception as e:
-        print(f"Error in loading CSV to DuckDB: {e}")
+        logger.error(f"Error in loading CSV to DuckDB: {e}")
         return False
 
     if is_source:
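The Azure path in process_duckdb chains the two new helpers: azure_to_csv_file streams the blob in 4 MB chunks into a pandas DataFrame (producer loop plus the chunk_load_to_pandas consumer thread), and duck_db_load_pd_to_table registers that DataFrame in a temporary DuckDB file. A hedged sketch of that chain, where comparison is a hypothetical, fully populated Comparison object:

from pathlib import Path

df = azure_to_csv_file(comparison, is_source=True)            # chunked download -> DataFrame
comparison.source.table = Path(comparison.source.table).stem  # strip the .csv suffix
if not duck_db_load_pd_to_table(config=comparison, is_source=True, df=df):
    raise ValueError("Error loading CSV into DuckDB for the source table.")
print(comparison.source.filepath)  # now points at tmp/<uuid>.duckdb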
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dcs-sdk
-Version: 1.6.5
+Version: 1.6.7
 Summary: SDK for DataChecks
 Author: Waterdip Labs
 Author-email: hello@waterdip.ai
@@ -30,6 +30,8 @@ Provides-Extra: sybase
 Provides-Extra: trino
 Provides-Extra: vertica
 Requires-Dist: attrs (>=23.1.0)
+Requires-Dist: azure-identity (>=1.25.1,<2.0.0)
+Requires-Dist: azure-storage-blob (>=12.27.1,<13.0.0)
 Requires-Dist: click (>=8.1)
 Requires-Dist: clickhouse-driver (>=0.2.9) ; extra == "clickhouse" or extra == "all-dbs"
 Requires-Dist: cryptography (>=44.0.1) ; extra == "snowflake" or extra == "all-dbs"
@@ -84,7 +86,7 @@ Requires-Dist: vertica-python (>=1.4.0) ; extra == "vertica" or extra == "all-db
 Description-Content-Type: text/markdown
 
 <h1 align="center">
-DCS SDK v1.6.4
+DCS SDK v1.6.7
 </h1>
 
 > SDK for DataChecks
@@ -1,4 +1,4 @@
-data_diff/__init__.py,sha256=QrrQt6GxG5gzVRlvFjJmfOzhR14fqKLrQs186KBWryY,10413
+data_diff/__init__.py,sha256=NcZ2rwvDST7cMyaaLANvNhoaFn-jC_WDg9pxDLXhZ04,10411
 data_diff/__main__.py,sha256=UvFvBKU74202bfRcIO_Wk-SU8WmnNuDK_1YVJpueMlc,16969
 data_diff/abcs/__init__.py,sha256=RkfhRKLXEForLCs4rZkTf0qc_b0TokSggSAcKI4yfZg,610
 data_diff/abcs/compiler.py,sha256=RuGhGlLTQuCzOJfYxa4gjcADsyvbZ9yZPuDuY6XH8Rk,785
@@ -49,9 +49,9 @@ dcs_core/cli/__init__.py,sha256=RkfhRKLXEForLCs4rZkTf0qc_b0TokSggSAcKI4yfZg,610
 dcs_core/cli/cli.py,sha256=dSr3D62XhjCEn4G5Jb0O4q05G1_YAMJgaOnLqciMAmI,6020
 dcs_core/core/__init__.py,sha256=8XyOIsx-uCpaEZUgfOrb0DCdvmz1TipNQdz01h7mun0,761
 dcs_core/core/common/__init__.py,sha256=RkfhRKLXEForLCs4rZkTf0qc_b0TokSggSAcKI4yfZg,610
-dcs_core/core/common/errors.py,sha256=P66w4O9E8lFVeB8EtQrCkHKk034fAHkshvrxYDV_ZtE,1737
+dcs_core/core/common/errors.py,sha256=nRczSqORCjcDngAuDsqzsc3_yZQzuUX26lPov0pTE1I,2268
 dcs_core/core/common/models/__init__.py,sha256=RkfhRKLXEForLCs4rZkTf0qc_b0TokSggSAcKI4yfZg,610
-dcs_core/core/common/models/configuration.py,sha256=2_gGVm0z71A84XMSBok0C4yzRe-_bFhKBYtZKWF5hhw,9165
+dcs_core/core/common/models/configuration.py,sha256=cFFr_SiAqYR3NIFGfz4rJVVX-LuGu-9TJC47ghL3Tes,9396
 dcs_core/core/common/models/dashboard.py,sha256=_WV1kbs4cKlFZ5QcXyMdTmDSZLYxhvZWWWQzvHReMxM,814
 dcs_core/core/common/models/data_source_resource.py,sha256=rNvj5NjvEQi2irHYjClKBFZbp70LTX9oGCPDeFURlAI,1559
 dcs_core/core/common/models/metric.py,sha256=0Oxp7YvWZVy7zbmi4u_opBDeknsuzXmnOrK01pP2fQw,4843
@@ -64,7 +64,8 @@ dcs_core/core/configuration/configuration_parser.py,sha256=KGOJqWbOWhTacuMwM1N55
 dcs_core/core/configuration/configuration_parser_arc.py,sha256=TOoPf12pEXLdkjEGJEGV6rJOMR8yqLedla6T1x6g-Xw,14057
 dcs_core/core/datasource/__init__.py,sha256=RkfhRKLXEForLCs4rZkTf0qc_b0TokSggSAcKI4yfZg,610
 dcs_core/core/datasource/base.py,sha256=YD_UuGuoORFJNX30IQMk6aitiiTCHaiAddSNgUBmRtA,1935
-dcs_core/core/datasource/manager.py,sha256=uIwr9N_rcSn7P1X496of7433kj97W0w-PLlo6FVHeCw,4132
+dcs_core/core/datasource/file_datasource.py,sha256=HG4av7KUFTfH2UlAl4bqcNI6MxpbSOA26cDqxmLUqh0,913
+dcs_core/core/datasource/manager.py,sha256=3oBjIqV0YYjXubCDGVBJP_jzrv-oBgBA-octoa8Wvaw,4795
 dcs_core/core/datasource/search_datasource.py,sha256=_conk1Q_kywJhKHYyEScoKlVt_yRd05zuAISvDmXqjw,15014
 dcs_core/core/datasource/sql_datasource.py,sha256=dlX-E--hadl2q8XpMNRyZmLGC35tltBsGDzlyZqzqtw,40730
 dcs_core/core/inspect.py,sha256=QICJKcEpQClLacsfNClFoiF08M01QnJh_U2VsXRh1iA,6427
@@ -99,15 +100,16 @@ dcs_core/core/validation/uniqueness_validation.py,sha256=a6zm0_omiULKbQcDit8J913
 dcs_core/core/validation/validity_validation.py,sha256=358oAGH112oVxyPhDnfT-ypVaMAkpZ8pM73qogtdh9w,35297
 dcs_core/integrations/__init__.py,sha256=RkfhRKLXEForLCs4rZkTf0qc_b0TokSggSAcKI4yfZg,610
 dcs_core/integrations/databases/__init__.py,sha256=RkfhRKLXEForLCs4rZkTf0qc_b0TokSggSAcKI4yfZg,610
+dcs_core/integrations/databases/azure_blob.py,sha256=rOPj-dv3ZaGUrr_rLMn8xjZXuEjlzcdfZv2RcZgnbps,4674
 dcs_core/integrations/databases/bigquery.py,sha256=26RuypLMmiARZIWkV_mxtnNL2yCs94YWerSGH5Nr10Q,7337
 dcs_core/integrations/databases/databricks.py,sha256=n4fm5m_mtRCdtjLGDvbNW18u7Ev234vDBjq_lxuOxns,1978
 dcs_core/integrations/databases/db2.py,sha256=hNGivvYCitp88ouZlCxp7iRQ-vnPiK1kL8x85NyGotk,26492
 dcs_core/integrations/databases/elasticsearch.py,sha256=6CTGs1WGrfgdDRNVt9DpOB0_z_znT6YoVj10E1WY-wQ,2152
-dcs_core/integrations/databases/mssql.py,sha256=3Gpy1UIclwYRF5_dbogbb5MgHlg35ZKcEczCNqlCh3o,33258
+dcs_core/integrations/databases/mssql.py,sha256=g0MmoG8-xFphJ2oZl-q_OZ2oT6yz-lVY09JTIvIx4-0,38910
 dcs_core/integrations/databases/mysql.py,sha256=mUFLIGdbF_ktIlA19P7kq7holp5ZkRezGgN6TL_uiJ4,15815
 dcs_core/integrations/databases/opensearch.py,sha256=XeDaHRLLym3wFeA_N6RzQEHmQCI3DjD8A86Y9UKwFEM,2190
 dcs_core/integrations/databases/oracle.py,sha256=7g8Vs958tDx1v2CWFulCvuje0cLxWgU5-PVJTc1IluE,29194
-dcs_core/integrations/databases/postgres.py,sha256=gXWVPSMJQdWo2ZWpzrnc1bONRyqdiX0osdRtvJLWPSE,18133
+dcs_core/integrations/databases/postgres.py,sha256=clT1fEIVCx3fcrare16rvBe_3TYWXn6wWwPc0Y-k9Ag,21326
 dcs_core/integrations/databases/redshift.py,sha256=R9eYxpD1Ve3ChZb-gyClJ6suSljG53O6Wez2GzUW0k0,2043
 dcs_core/integrations/databases/snowflake.py,sha256=NI6sgL9iakyCbIxtj0DiqeOpF5F9ybuhtG_IwvT86Ws,1942
 dcs_core/integrations/databases/spark_df.py,sha256=pO9hSENLdrRaPvPa66yCrKS2iv5JWJBsU9XB13BBasY,3659
131
133
  dcs_core/report/static/index.js.LICENSE.txt,sha256=bBDZBJVEDrqjCi7sfoF8CchjFn3hdcbNkP7ub7kbcXQ,201041
132
134
  dcs_sdk/__init__.py,sha256=RkfhRKLXEForLCs4rZkTf0qc_b0TokSggSAcKI4yfZg,610
133
135
  dcs_sdk/__main__.py,sha256=Qn8stIaQGrdLjHQ-H7xO0T-brtq5RWZoWU9QvqoarV8,683
134
- dcs_sdk/__version__.py,sha256=0MZwU2M7klH43EtQxpbFKior602GfMQYbBVWxSs857c,633
136
+ dcs_sdk/__version__.py,sha256=_MZd1Vn40uGUurqdUiub-zDoYZlEiNqfaaWWlEJhxps,633
135
137
  dcs_sdk/cli/__init__.py,sha256=RkfhRKLXEForLCs4rZkTf0qc_b0TokSggSAcKI4yfZg,610
136
138
  dcs_sdk/cli/cli.py,sha256=jaO52UrMWLafcF_yhqllPkmYSTuO2sksFi30fYFdAB4,4406
137
139
  dcs_sdk/sdk/__init__.py,sha256=skrZcgWWJBL6NXTUERywJ3qRJRemgpDXyW7lPg1FJk8,2107
138
140
  dcs_sdk/sdk/config/__init__.py,sha256=RkfhRKLXEForLCs4rZkTf0qc_b0TokSggSAcKI4yfZg,610
139
- dcs_sdk/sdk/config/config_loader.py,sha256=oooSTV6QjbXKpCkwpl6vcBdjABGT-h99vBbWTbIkmjc,21683
141
+ dcs_sdk/sdk/config/config_loader.py,sha256=ZbSGQ56LsHv4_mxNhYrf6eoegO2R4PaqAs8iAghU73M,22435
140
142
  dcs_sdk/sdk/data_diff/__init__.py,sha256=RkfhRKLXEForLCs4rZkTf0qc_b0TokSggSAcKI4yfZg,610
141
- dcs_sdk/sdk/data_diff/data_differ.py,sha256=zxWa-mYAdfZepNuXz1h_xxFQBC4tdhBqlbZCVEfb8Y8,36378
143
+ dcs_sdk/sdk/data_diff/data_differ.py,sha256=00lKfGU4xMeXuS_Wpvjf-TAgMiZ7r5_bv1EQsv1EdjQ,39050
142
144
  dcs_sdk/sdk/rules/__init__.py,sha256=_BkKcE_jfdDQI_ECdOamJaefMKEXrKpYjPpnBQXl_Xs,657
143
145
  dcs_sdk/sdk/rules/rules_mappping.py,sha256=fxakVkf7B2cVkYSO946LTim_HmMsl6lBDBqZjTTsSPI,1292
144
146
  dcs_sdk/sdk/rules/rules_repository.py,sha256=x0Rli-wdnHAmXm5526go_qC3P-eFRt-4L7fs4hNqC-g,7564
@@ -152,8 +154,8 @@ dcs_sdk/sdk/utils/similarity_score/jaccard_provider.py,sha256=Jd0TvIGOULNTsiCL_F
 dcs_sdk/sdk/utils/similarity_score/levenshtein_distance_provider.py,sha256=puAWPnoWfNo4BN4-kXIUHrtrt5jLv3Vkw_NfHvjYrn4,1185
 dcs_sdk/sdk/utils/table.py,sha256=X8HxdYTWyx_oVrBWPsXlmA-xJKXXDBW9RrhlWNqA1As,18224
 dcs_sdk/sdk/utils/themes.py,sha256=Meo2Yldv4uyPpEqI7qdA28Aa6sxtwUU1dLKKm4QavjM,1403
-dcs_sdk/sdk/utils/utils.py,sha256=vF2zAvgt__Y8limicWTEWRyn41SBVJN81ZCTBRy6hQg,11907
-dcs_sdk-1.6.5.dist-info/METADATA,sha256=A_zRG4BkxZt8pO_JwxTTL-6Sw1jOSQ93yG8bigJCnTc,7568
-dcs_sdk-1.6.5.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
-dcs_sdk-1.6.5.dist-info/entry_points.txt,sha256=XhODNz7UccgPOyklXgp7pIfTTXArd6-V0mImjhnhwto,80
-dcs_sdk-1.6.5.dist-info/RECORD,,
+dcs_sdk/sdk/utils/utils.py,sha256=i-oEiSSs8DlBni9fwBTwXEnrl8FLxzZ0dLtUGxVwLWU,16276
+dcs_sdk-1.6.7.dist-info/METADATA,sha256=0kvVpF3PedRyXnaMEYfGnD73wdulAv28pr5b8edzjUQ,7670
+dcs_sdk-1.6.7.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
+dcs_sdk-1.6.7.dist-info/entry_points.txt,sha256=XhODNz7UccgPOyklXgp7pIfTTXArd6-V0mImjhnhwto,80
+dcs_sdk-1.6.7.dist-info/RECORD,,