dcs-sdk 1.7.0__py3-none-any.whl → 1.7.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dcs_core/core/datasource/file_datasource.py +98 -4
- dcs_core/integrations/databases/azure_blob.py +13 -118
- dcs_core/integrations/databases/duck_db.py +71 -2
- dcs_sdk/__version__.py +1 -1
- {dcs_sdk-1.7.0.dist-info → dcs_sdk-1.7.1.dist-info}/METADATA +2 -2
- {dcs_sdk-1.7.0.dist-info → dcs_sdk-1.7.1.dist-info}/RECORD +8 -8
- {dcs_sdk-1.7.0.dist-info → dcs_sdk-1.7.1.dist-info}/WHEEL +0 -0
- {dcs_sdk-1.7.0.dist-info → dcs_sdk-1.7.1.dist-info}/entry_points.txt +0 -0
dcs_core/core/datasource/file_datasource.py CHANGED

@@ -12,19 +12,113 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-
+import os
+import uuid
+from abc import ABC, abstractmethod
+from contextlib import contextmanager
+from pathlib import Path
+from typing import Dict, Iterator
 
+import duckdb
+from loguru import logger
+
+from dcs_core.core.common.models.data_source_resource import RawColumnInfo
 from dcs_core.core.datasource.base import DataSource
+from dcs_core.integrations.databases.duck_db import DuckDb
 
 
-class FileDataSource(DataSource):
+class FileDataSource(DataSource, ABC):
     """
     Abstract class for File data sources
     """
 
     def __init__(self, data_source_name: str, data_connection: Dict):
         super().__init__(data_source_name, data_connection)
+        self.temp_dir_name = "tmp"
+
+    @contextmanager
+    def as_duckdb(self, table_name: str) -> Iterator["DuckDb"]:
+        """Returns a DuckDB instance for the given table name"""
+        duckdb_path = self.load_file_to_duckdb(table_name)
+        duck_db_ds = DuckDb(data_source_name=self.data_source_name, data_connection={"file_path": duckdb_path})
+        try:
+            duck_db_ds.connect()
+            yield duck_db_ds
+        finally:
+            duck_db_ds.close()
+
+    @abstractmethod
+    def query_get_table_names(self) -> dict:
+        """
+        Query to get table names
+        """
+        pass
 
-
-
+    @abstractmethod
+    def query_get_database_version(self) -> str:
+        """
+        Get the database version
+        :return: version string
+        """
         pass
+
+    @abstractmethod
+    def _download_to_path(self, table_name: str, path: str) -> None:
+        """Vendor-specific download"""
+        pass
+
+    def load_file_to_duckdb(self, table_name: str) -> str:
+        """Template method"""
+        os.makedirs(self.temp_dir_name, exist_ok=True)
+
+        ext = Path(table_name).suffix
+        if not ext:
+            raise ValueError(f"Invalid file name {table_name}")
+
+        temp_path = f"{self.temp_dir_name}/{uuid.uuid4()}{ext}"
+
+        try:
+            self._download_to_path(table_name, temp_path)
+            return self._load_path_to_duckdb(temp_path, table_name)
+        finally:
+            if os.path.exists(temp_path):
+                os.remove(temp_path)
+                logger.info(f"Cleaned up temp file {temp_path}")
+
+    def _load_path_to_duckdb(self, path: str, table_name: str) -> str:
+        """Shared DuckDB loading logic"""
+        tmp_dir = self.temp_dir_name
+        duckdb_path = f"{tmp_dir}/{uuid.uuid4()}.duckdb"
+        table_stem = Path(table_name).stem
+
+        logger.info(f"Loading {path} into DuckDB")
+
+        conn = None
+        try:
+            conn = duckdb.connect(database=duckdb_path, read_only=False)
+            conn.execute(
+                f'CREATE TABLE "{table_stem}" AS SELECT * FROM read_csv_auto(?)',
+                [path],
+            )
+            logger.info(f"Successfully loaded data into {duckdb_path}")
+            return duckdb_path
+        except Exception as e:
+            logger.warning(f"read_csv_auto failed: {e}. Trying with ALL_VARCHAR=TRUE")
+            try:
+                if conn:
+                    conn.close()
+                conn = duckdb.connect(database=duckdb_path, read_only=False)
+                conn.execute(
+                    f'CREATE TABLE "{table_stem}" AS ' f"SELECT * FROM read_csv(?, ALL_VARCHAR=TRUE, SAMPLE_SIZE=-1)",
+                    [path],
+                )
+                logger.info(f"Successfully loaded data with ALL_VARCHAR into {duckdb_path}")
+                return duckdb_path
+            except Exception as fallback_error:
+                logger.error(f"Failed to load CSV into DuckDB: {fallback_error}")
+                if os.path.exists(duckdb_path):
+                    os.remove(duckdb_path)
+                raise
+        finally:
+            if conn:
+                conn.close()
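Taken together, this change turns FileDataSource into a template: subclasses supply `_download_to_path()`, and the base class handles temp files, DuckDB loading with the ALL_VARCHAR fallback, and cleanup. Below is a minimal sketch of a conforming subclass, assuming only what the diff shows; the `LocalFileDataSource` name and its local-copy "download" are illustrative, not part of the package, and `fetchall` is assumed from its use inside the DuckDb diff further down.

import shutil

from dcs_core.core.datasource.file_datasource import FileDataSource


class LocalFileDataSource(FileDataSource):  # hypothetical subclass, for illustration only
    def query_get_table_names(self) -> dict:
        return {"tables": ["orders.csv"]}

    def query_get_database_version(self) -> str:
        return "local-filesystem"

    def _download_to_path(self, table_name: str, path: str) -> None:
        # The "download" is just a local copy here; real subclasses stream from remote storage.
        shutil.copyfile(table_name, path)


# The base class copies orders.csv to tmp/<uuid>.csv, loads it into a temporary
# DuckDB file, yields a connected DuckDb datasource, and cleans up on exit.
ds = LocalFileDataSource("local", data_connection={})
with ds.as_duckdb("orders.csv") as duck:
    print(duck.fetchall('SELECT COUNT(*) FROM "orders"'))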
dcs_core/integrations/databases/azure_blob.py CHANGED

@@ -12,16 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import io
 import os
 import uuid
-from concurrent.futures import ThreadPoolExecutor
 from pathlib import Path
-from queue import Empty, Queue
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, Optional
 
 import duckdb
-import pandas as pd
 from azure.storage.blob import BlobServiceClient
 from loguru import logger
 
@@ -30,6 +26,7 @@ from dcs_core.core.common.errors import (
     DataChecksDataSourcesConnectionError,
     DatachecksTableFetchError,
 )
+from dcs_core.core.common.models.data_source_resource import RawColumnInfo
 from dcs_core.core.datasource.file_datasource import FileDataSource
 
 
@@ -38,6 +35,7 @@ class AzureBlobDataSource(FileDataSource):
         super().__init__(data_source_name, data_connection)
         self.allowed_file_extensions = [".csv"]
         self.blob_service_client: Optional[BlobServiceClient] = None
+        self.DEFAULT_NUMERIC_PRECISION = 16383
         self.connection = None
 
     def connect(self) -> Any:
@@ -90,28 +88,8 @@ class AzureBlobDataSource(FileDataSource):
         except Exception as e:
             raise DatachecksTableFetchError(f"Failed to list blobs: {e}")
 
-    def query_get_table_columns(self, table: str):
-        """
-        Get column names for a table (CSV blob in this case).
-        """
-        if not self.is_connected():
-            raise DataChecksDataSourcesConnectionError("Not connected to Azure Blob Storage")
-
-        if not any(table.endswith(ext) for ext in self.allowed_file_extensions):
-            raise ValueError(f"Unsupported file type for {table}. Allowed: {self.allowed_file_extensions}")
-
-        try:
-            blob_client = self.connection.get_blob_client(blob=table)
-            download_stream = blob_client.download_blob()
-            data = download_stream.readall()
-            if table.endswith(".csv"):
-                df = pd.read_csv(io.BytesIO(data))
-            else:
-                raise ValueError(f"Unsupported file type for {table}. Allowed: {self.allowed_file_extensions}")
-
-            return [{"column_name": col, "column_type": "string"} for col in df.columns.tolist()]
-        except Exception as e:
-            raise DatachecksColumnFetchError(f"Failed to read columns from blob '{table}': {e}")
+    def safe_get(self, lst, idx, default=None):
+        return lst[idx] if 0 <= idx < len(lst) else default
 
     def query_get_database_version(self) -> str:
         """
@@ -121,97 +99,14 @@ class AzureBlobDataSource(FileDataSource):
         api_version = self.blob_service_client.api_version
         return api_version
 
-    def _chunk_load_to_pandas(self, queue, result_df, timeout=...):
-        """..."""
-        df = pd.DataFrame()
-        try:
-            while True:
-                try:
-                    data = queue.get(timeout=timeout)
-                except Empty:
-                    continue
-
-                if data is None:
-                    break
-
-                try:
-                    chunk = pd.read_csv(io.BytesIO(data), dtype=str)
-                    df = pd.concat([df, chunk], ignore_index=True)
-                except Exception as e:
-                    logger.error(f"[ERROR] Failed to read CSV chunk: {e}")
-                    continue
-
-        except Exception as e:
-            logger.error(f"[FATAL] Consumer crashed: {e}")
-
-        finally:
-            result_df.append(df)
-
-    def _load_blob_to_pandas(self, table_name: str):
+    def _download_to_path(self, table_name: str, path: str):
+        """Download blob to path"""
         blob_client = self.connection.get_blob_client(blob=table_name)
-
-        blob_size = blob_client.get_blob_properties().size
-        start = 0
-        queue = Queue()
-        result_df = []
-
-        with ThreadPoolExecutor(max_workers=1) as executor:
-            executor.submit(self._chunk_load_to_pandas, queue, result_df)
-
-            all_data = b""
-            while start < blob_size:
-                end = min(start + CHUNK_SIZE - 1, blob_size - 1)
-                data = blob_client.download_blob(offset=start, length=end - start + 1).readall()
-                all_data += data
-                queue.put(data)
-                start += CHUNK_SIZE
-
-            queue.put(None)
-        if not result_df or len(result_df) == 0:
-            raise ValueError("No data downloaded from Azure Blob Storage")
-        return result_df[0]
-
-    def _load_pd_to_duckdb(self, df: pd.DataFrame, table_name: str):
-        dir_name = "tmp"
-        if not os.path.exists(dir_name):
-            os.makedirs(dir_name)
-
-        duck_db_file_name = f"{dir_name}/{uuid.uuid4()}.duckdb"
-        file_path = None
+        logger.info(f"Downloading {table_name} to {path}")
         try:
-
-
-
-
-            file_path = duck_db_file_name
-
-            conn.register("df_view", df)
-
-            conn.execute(
-                f"""
-                CREATE OR REPLACE TABLE "{table_name}" AS
-                SELECT * FROM df_view;
-                """
-            )
-            conn.unregister("df_view")
-            conn.close()
-
+            with open(path, "wb") as f:
+                stream = blob_client.download_blob()
+                for chunk in stream.chunks():
+                    f.write(chunk)
         except Exception as e:
-
-            raise
-
-        return file_path
-
-    def load_file_to_duckdb(self, table_name: str):
-        logger.info(f"Loading {table_name} to pandas")
-        df: pd.DataFrame = self._load_blob_to_pandas(table_name)
-
-        if df is None or df.empty:
-            raise ValueError("No data downloaded from Azure Blob Storage")
-
-        name_only = Path(table_name).stem
-
-        logger.info(f"Loading {table_name} to duckdb")
-        file_path = self._load_pd_to_duckdb(df, name_only)
-
-        return file_path
+            raise DataChecksDataSourcesConnectionError(f"Failed to download blob '{table_name}': {e}")
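Net effect: the pandas/thread-queue pipeline is removed, and AzureBlobDataSource now streams the blob to disk and defers CSV loading to the shared FileDataSource template. A hedged sketch of the new flow follows; the `data_connection` keys are assumptions (the diff does not show the connection schema), and only the class and method names come from the diff itself.

from dcs_core.integrations.databases.azure_blob import AzureBlobDataSource

# The connection keys below are illustrative assumptions, not taken from the diff.
ds = AzureBlobDataSource(
    data_source_name="blob_source",
    data_connection={"connection_string": "<azure-connection-string>", "container": "data"},
)
ds.connect()

# Inherited template method: _download_to_path() streams the blob chunk-by-chunk
# via stream.chunks(), then the CSV is loaded into a temporary DuckDB file.
with ds.as_duckdb("orders.csv") as duck:
    print(duck.fetchall('SELECT COUNT(*) FROM "orders"'))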
dcs_core/integrations/databases/duck_db.py CHANGED

@@ -12,12 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import os
+from pathlib import Path
 from typing import Any, Dict
 
 import duckdb
 from loguru import logger
 
 from dcs_core.core.common.errors import DataChecksDataSourcesConnectionError
+from dcs_core.core.common.models.data_source_resource import RawColumnInfo
 from dcs_core.core.datasource.sql_datasource import SQLDataSource
 
 
@@ -26,6 +28,20 @@ class DuckDb(SQLDataSource):
         super().__init__(data_source_name, data_connection)
         self.connection = None
         self.use_sa_text_query = False
+        self.regex_patterns = {
+            "uuid": r"^[0-9a-f]{8}-[0-9a-f]{4}-[1-5][0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}$",
+            "usa_phone": r"^(\+1[-.\s]?)?(\(?\d{3}\)?[-.\s]?)?\d{3}[-.\s]?\d{4}$",
+            "email": r"^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$",
+            "usa_zip_code": r"^[0-9]{5}(?:-[0-9]{4})?$",
+            "ssn": r"^[0-9]{3}-[0-9]{2}-[0-9]{4}$",
+            "sedol": r"^[B-DF-HJ-NP-TV-XZ0-9]{6}[0-9]$",
+            "lei": r"^[A-Z0-9]{18}[0-9]{2}$",
+            "cusip": r"^[0-9A-Z]{9}$",
+            "figi": r"^BBG[A-Z0-9]{9}$",
+            "isin": r"^[A-Z]{2}[A-Z0-9]{9}[0-9]$",
+            "perm_id": r"^\d{4}[- ]?\d{4}[- ]?\d{4}[- ]?\d{4}[- ]?\d{3}$",
+        }
+        self.DEFAULT_NUMERIC_PRECISION = 16383
 
     def connect(self) -> Any:
         """
@@ -49,9 +65,12 @@ class DuckDb(SQLDataSource):
         Close the connection
         """
         logger.info("Closing DuckDB connection")
-        self.connection.close()
+        if self.connection:
+            self.connection.close()
         try:
-            os.remove(self.data_connection.get("file_path"))
+            fp = self.data_connection.get("file_path")
+            if fp and os.path.exists(fp):
+                os.remove(fp)
         except Exception as e:
             logger.error(f"Failed to remove the file {self.data_connection.get('file_path')}: {e}")
 
@@ -70,3 +89,53 @@ class DuckDb(SQLDataSource):
         :return: quoted column name
         """
         return f'"{column}"'
+
+    def query_get_table_columns(
+        self,
+        table: str,
+        schema: str | None = None,
+    ) -> Dict[str, RawColumnInfo]:
+        """
+        Get the schema of a table.
+        :param table: table name
+        :return: Dictionary with column names and their types
+        """
+        schema = schema or self.schema_name
+        info_schema_path = ["information_schema", "columns"]
+        if self.database:
+            database = self.quote_database(self.database)
+            info_schema_path.insert(0, database)
+
+        query = f"""
+            SELECT
+                column_name,
+                data_type,
+                CASE WHEN data_type IN ('TIMESTAMP', 'TIME') THEN datetime_precision ELSE NULL END AS datetime_precision,
+                CASE WHEN data_type = 'DECIMAL' THEN COALESCE(numeric_precision, 131072 + {self.DEFAULT_NUMERIC_PRECISION})
+                     WHEN data_type IN ('DOUBLE', 'REAL', 'FLOAT') THEN numeric_precision
+                     ELSE numeric_precision END AS numeric_precision,
+                CASE WHEN data_type = 'DECIMAL' THEN COALESCE(numeric_scale, {self.DEFAULT_NUMERIC_PRECISION}) ELSE numeric_scale END AS numeric_scale,
+                NULL AS collation_name,
+                CASE WHEN data_type = 'VARCHAR' THEN character_maximum_length ELSE NULL END AS character_maximum_length
+            FROM information_schema.columns
+            WHERE table_name = '{table}'
+            ORDER BY ordinal_position
+        """
+
+        rows = self.fetchall(query)
+        if not rows:
+            raise RuntimeError(f"{table}: Table, {schema}: Schema, does not exist, or has no columns")
+
+        column_info = {
+            r[0]: RawColumnInfo(
+                column_name=self.safe_get(r, 0),
+                data_type=self.safe_get(r, 1),
+                datetime_precision=self.safe_get(r, 2),
+                numeric_precision=self.safe_get(r, 3),
+                numeric_scale=self.safe_get(r, 4),
+                collation_name=self.safe_get(r, 5),
+                character_maximum_length=self.safe_get(r, 6),
+            )
+            for r in rows
+        }
+        return column_info
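For reference, the new query_get_table_columns() reads DuckDB's information_schema.columns. This standalone sketch shows the shape of that catalog against a made-up table; the table and its columns are invented for illustration, and exact metadata values can vary by DuckDB version.

import duckdb

# Hypothetical table, only to exercise the information_schema view the new method queries.
con = duckdb.connect()
con.execute('CREATE TABLE "orders" (order_id INTEGER, amount DECIMAL(10, 2), note VARCHAR)')

rows = con.execute(
    """
    SELECT column_name, data_type, datetime_precision, numeric_precision,
           numeric_scale, character_maximum_length
    FROM information_schema.columns
    WHERE table_name = 'orders'
    ORDER BY ordinal_position
    """
).fetchall()

for row in rows:
    print(row)  # one tuple of type/precision metadata per column
con.close()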
dcs_sdk/__version__.py CHANGED

{dcs_sdk-1.7.0.dist-info → dcs_sdk-1.7.1.dist-info}/METADATA CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dcs-sdk
-Version: 1.7.0
+Version: 1.7.1
 Summary: SDK for DataChecks
 Author: Waterdip Labs
 Author-email: hello@waterdip.ai
@@ -86,7 +86,7 @@ Requires-Dist: vertica-python (>=1.4.0) ; extra == "vertica" or extra == "all-db
 Description-Content-Type: text/markdown
 
 <h1 align="center">
-DCS SDK v1.7.0
+DCS SDK v1.7.1
 </h1>
 
 > SDK for DataChecks
{dcs_sdk-1.7.0.dist-info → dcs_sdk-1.7.1.dist-info}/RECORD CHANGED

@@ -64,7 +64,7 @@ dcs_core/core/configuration/configuration_parser.py,sha256=ue7tzWkOpamhXw_DJhr5Z
 dcs_core/core/configuration/configuration_parser_arc.py,sha256=TOoPf12pEXLdkjEGJEGV6rJOMR8yqLedla6T1x6g-Xw,14057
 dcs_core/core/datasource/__init__.py,sha256=RkfhRKLXEForLCs4rZkTf0qc_b0TokSggSAcKI4yfZg,610
 dcs_core/core/datasource/base.py,sha256=YD_UuGuoORFJNX30IQMk6aitiiTCHaiAddSNgUBmRtA,1935
-dcs_core/core/datasource/file_datasource.py,sha256=...
+dcs_core/core/datasource/file_datasource.py,sha256=_uwxnunv8bF9IzKF3oC-lHeaG1mmQBsbQbgRjPAn208,4349
 dcs_core/core/datasource/manager.py,sha256=cuh6XAOCxn2b9SQxYwYurgBb6WUD8ZS6KRIg3FAloYU,4824
 dcs_core/core/datasource/search_datasource.py,sha256=_conk1Q_kywJhKHYyEScoKlVt_yRd05zuAISvDmXqjw,15014
 dcs_core/core/datasource/sql_datasource.py,sha256=dlX-E--hadl2q8XpMNRyZmLGC35tltBsGDzlyZqzqtw,40730
@@ -100,11 +100,11 @@ dcs_core/core/validation/uniqueness_validation.py,sha256=a6zm0_omiULKbQcDit8J913
 dcs_core/core/validation/validity_validation.py,sha256=358oAGH112oVxyPhDnfT-ypVaMAkpZ8pM73qogtdh9w,35297
 dcs_core/integrations/__init__.py,sha256=RkfhRKLXEForLCs4rZkTf0qc_b0TokSggSAcKI4yfZg,610
 dcs_core/integrations/databases/__init__.py,sha256=RkfhRKLXEForLCs4rZkTf0qc_b0TokSggSAcKI4yfZg,610
-dcs_core/integrations/databases/azure_blob.py,sha256=...
+dcs_core/integrations/databases/azure_blob.py,sha256=Pzm7ygZVe5DnpuggpAbWGA-kICJ2reMaefR5I9nGyeo,4399
 dcs_core/integrations/databases/bigquery.py,sha256=26RuypLMmiARZIWkV_mxtnNL2yCs94YWerSGH5Nr10Q,7337
 dcs_core/integrations/databases/databricks.py,sha256=n4fm5m_mtRCdtjLGDvbNW18u7Ev234vDBjq_lxuOxns,1978
 dcs_core/integrations/databases/db2.py,sha256=hNGivvYCitp88ouZlCxp7iRQ-vnPiK1kL8x85NyGotk,26492
-dcs_core/integrations/databases/duck_db.py,sha256=...
+dcs_core/integrations/databases/duck_db.py,sha256=X4FRSsobOFCIi329cYofQsMd_fkRI4KxC8BIrtiDz4g,5531
 dcs_core/integrations/databases/elasticsearch.py,sha256=6CTGs1WGrfgdDRNVt9DpOB0_z_znT6YoVj10E1WY-wQ,2152
 dcs_core/integrations/databases/mssql.py,sha256=g0MmoG8-xFphJ2oZl-q_OZ2oT6yz-lVY09JTIvIx4-0,38910
 dcs_core/integrations/databases/mysql.py,sha256=mUFLIGdbF_ktIlA19P7kq7holp5ZkRezGgN6TL_uiJ4,15815
@@ -134,7 +134,7 @@ dcs_core/report/static/index.js,sha256=p4wvku-zlXi0y4gWeSzV1amY0s4mjtUq2QsezARLV
 dcs_core/report/static/index.js.LICENSE.txt,sha256=bBDZBJVEDrqjCi7sfoF8CchjFn3hdcbNkP7ub7kbcXQ,201041
 dcs_sdk/__init__.py,sha256=RkfhRKLXEForLCs4rZkTf0qc_b0TokSggSAcKI4yfZg,610
 dcs_sdk/__main__.py,sha256=Qn8stIaQGrdLjHQ-H7xO0T-brtq5RWZoWU9QvqoarV8,683
-dcs_sdk/__version__.py,sha256=...
+dcs_sdk/__version__.py,sha256=QCNpj_HNk0BPKbYaFGZRdWUw410tRn837AoNnQ040wI,633
 dcs_sdk/cli/__init__.py,sha256=RkfhRKLXEForLCs4rZkTf0qc_b0TokSggSAcKI4yfZg,610
 dcs_sdk/cli/cli.py,sha256=jaO52UrMWLafcF_yhqllPkmYSTuO2sksFi30fYFdAB4,4406
 dcs_sdk/sdk/__init__.py,sha256=skrZcgWWJBL6NXTUERywJ3qRJRemgpDXyW7lPg1FJk8,2107
@@ -156,7 +156,7 @@ dcs_sdk/sdk/utils/similarity_score/levenshtein_distance_provider.py,sha256=puAWP
 dcs_sdk/sdk/utils/table.py,sha256=X8HxdYTWyx_oVrBWPsXlmA-xJKXXDBW9RrhlWNqA1As,18224
 dcs_sdk/sdk/utils/themes.py,sha256=Meo2Yldv4uyPpEqI7qdA28Aa6sxtwUU1dLKKm4QavjM,1403
 dcs_sdk/sdk/utils/utils.py,sha256=a9QGEVL8L7asbJm_VBwgKvJQknsvuqWS0uTUaHsDPiY,16463
-dcs_sdk-1.7.0.dist-info/METADATA,sha256=...
-dcs_sdk-1.7.0.dist-info/WHEEL,sha256=...
-dcs_sdk-1.7.0.dist-info/entry_points.txt,sha256=...
-dcs_sdk-1.7.0.dist-info/RECORD,,
+dcs_sdk-1.7.1.dist-info/METADATA,sha256=yp3uvwoERa7LCAifpIFs80wist4U3tjyE8D24a23NlM,7652
+dcs_sdk-1.7.1.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
+dcs_sdk-1.7.1.dist-info/entry_points.txt,sha256=XhODNz7UccgPOyklXgp7pIfTTXArd6-V0mImjhnhwto,80
+dcs_sdk-1.7.1.dist-info/RECORD,,
{dcs_sdk-1.7.0.dist-info → dcs_sdk-1.7.1.dist-info}/WHEEL: file without changes

{dcs_sdk-1.7.0.dist-info → dcs_sdk-1.7.1.dist-info}/entry_points.txt: file without changes