dataengineer_toolbox 0.0.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (25) hide show
  1. dataengineer_toolbox-0.0.4/LICENSE +21 -0
  2. dataengineer_toolbox-0.0.4/MANIFEST.in +8 -0
  3. dataengineer_toolbox-0.0.4/PKG-INFO +100 -0
  4. dataengineer_toolbox-0.0.4/README.md +45 -0
  5. dataengineer_toolbox-0.0.4/dataeng_toolbox/__init__.py +13 -0
  6. dataengineer_toolbox-0.0.4/dataeng_toolbox/built_in/scd_test.py +0 -0
  7. dataengineer_toolbox-0.0.4/dataeng_toolbox/core/__init__.py +70 -0
  8. dataengineer_toolbox-0.0.4/dataeng_toolbox/data_loader.py +88 -0
  9. dataengineer_toolbox-0.0.4/dataeng_toolbox/entity.py +64 -0
  10. dataengineer_toolbox-0.0.4/dataeng_toolbox/model.py +110 -0
  11. dataengineer_toolbox-0.0.4/dataeng_toolbox/spark_utils.py +170 -0
  12. dataengineer_toolbox-0.0.4/dataeng_toolbox/utils.py +30 -0
  13. dataengineer_toolbox-0.0.4/dataengineer_toolbox.egg-info/PKG-INFO +100 -0
  14. dataengineer_toolbox-0.0.4/dataengineer_toolbox.egg-info/SOURCES.txt +23 -0
  15. dataengineer_toolbox-0.0.4/dataengineer_toolbox.egg-info/dependency_links.txt +1 -0
  16. dataengineer_toolbox-0.0.4/dataengineer_toolbox.egg-info/requires.txt +10 -0
  17. dataengineer_toolbox-0.0.4/dataengineer_toolbox.egg-info/top_level.txt +1 -0
  18. dataengineer_toolbox-0.0.4/examples/README.md +34 -0
  19. dataengineer_toolbox-0.0.4/examples/basic_usage.py +40 -0
  20. dataengineer_toolbox-0.0.4/examples/vtable_model_example.py +123 -0
  21. dataengineer_toolbox-0.0.4/pyproject.toml +66 -0
  22. dataengineer_toolbox-0.0.4/setup.cfg +4 -0
  23. dataengineer_toolbox-0.0.4/tests/__init__.py +1 -0
  24. dataengineer_toolbox-0.0.4/tests/test_vtable_model.py +340 -0
  25. dataengineer_toolbox-0.0.4/tests/test_vtable_serialization.py +215 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Dac Toan Ho
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,8 @@
1
+ include README.md
2
+ include LICENSE
3
+ include pyproject.toml
4
+ recursive-include dataeng_toolbox *.py
5
+ recursive-include tests *.py
6
+ recursive-include examples *.py *.md
7
+ exclude .gitignore
8
+ exclude .github/workflows/*.yml
@@ -0,0 +1,100 @@
1
+ Metadata-Version: 2.4
2
+ Name: dataengineer_toolbox
3
+ Version: 0.0.4
4
+ Summary: A comprehensive data engineering toolbox for Python
5
+ Author-email: Your Name <your.email@example.com>
6
+ License: MIT License
7
+
8
+ Copyright (c) 2025 Dac Toan Ho
9
+
10
+ Permission is hereby granted, free of charge, to any person obtaining a copy
11
+ of this software and associated documentation files (the "Software"), to deal
12
+ in the Software without restriction, including without limitation the rights
13
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14
+ copies of the Software, and to permit persons to whom the Software is
15
+ furnished to do so, subject to the following conditions:
16
+
17
+ The above copyright notice and this permission notice shall be included in all
18
+ copies or substantial portions of the Software.
19
+
20
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26
+ SOFTWARE.
27
+
28
+ Project-URL: Homepage, https://github.com/yourusername/dataengineer_toolbox
29
+ Project-URL: Repository, https://github.com/yourusername/dataengineer_toolbox
30
+ Project-URL: Issues, https://github.com/yourusername/dataengineer_toolbox/issues
31
+ Keywords: data engineering,toolbox,python
32
+ Classifier: Development Status :: 3 - Alpha
33
+ Classifier: Intended Audience :: Developers
34
+ Classifier: Programming Language :: Python :: 3
35
+ Classifier: Programming Language :: Python :: 3.9
36
+ Classifier: Programming Language :: Python :: 3.10
37
+ Classifier: Programming Language :: Python :: 3.11
38
+ Classifier: Programming Language :: Python :: 3.12
39
+ Classifier: Programming Language :: Python :: 3.13
40
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
41
+ Classifier: Topic :: Scientific/Engineering
42
+ Requires-Python: >=3.9
43
+ Description-Content-Type: text/markdown
44
+ License-File: LICENSE
45
+ Requires-Dist: pyspark==3.5.7
46
+ Requires-Dist: pydantic>=2.0
47
+ Requires-Dist: pydantic-core>=2.0
48
+ Provides-Extra: dev
49
+ Requires-Dist: pytest>=7.0; extra == "dev"
50
+ Requires-Dist: pytest-cov>=4.0; extra == "dev"
51
+ Requires-Dist: black>=23.0; extra == "dev"
52
+ Requires-Dist: flake8>=6.0; extra == "dev"
53
+ Requires-Dist: mypy>=1.0; extra == "dev"
54
+ Dynamic: license-file
55
+
56
+ # DataEngineerToolbox
57
+ 1. Set up virtual environment
58
+ ```
59
+ python -m venv venv
60
+ venv\Scripts\activate
61
+ ```
62
+ 2. Install build dependencies
63
+
64
+ ```
65
+ pip install build
66
+ ```
67
+ 3. Install package dependencies
68
+ ```
69
+ pip install -e .
70
+ ```
71
+ 4. Build the wheel
72
+ ```
73
+ python -m build
74
+ ```
75
+ This will generate two files in the dist folder:
76
+
77
+ - dataengineer_toolbox-0.0.4.tar.gz (source distribution)
78
+ - dataengineer_toolbox-0.0.4-py3-none-any.whl (wheel)
79
+
80
+ 5. (Optional) Publish to PyPI
81
+
82
+ ```
83
+ pip install twine
84
+ twine upload dist/*
85
+ ```
86
+
87
+ You will be prompted for your PyPI username and password.
88
+
89
+ # Run Unit Tests
90
+ ## Run with verbose output and coverage
91
+ pytest --cov=dataeng_toolbox
92
+
93
+ ## Run specific test file
94
+ pytest tests\test_vtable_model.py
95
+
96
+ ## Run specific test class
97
+ pytest tests\test_vtable_serialization.py::TestVTableListSerialization
98
+
99
+ ## Run specific test method
100
+ pytest tests\test_vtable_serialization.py::TestVTableListSerialization::test_dumps_returns_string
@@ -0,0 +1,45 @@
1
+ # DataEngineerToolbox
2
+ 1. Set up virtual environment
3
+ ```
4
+ python -m venv venv
5
+ venv\Scripts\activate
6
+ ```
7
+ 2. Install build dependencies
8
+
9
+ ```
10
+ pip install build
11
+ ```
12
+ 3. Install package dependencies
13
+ ```
14
+ pip install -e .
15
+ ```
16
+ 4. Build the wheel
17
+ ```
18
+ python -m build
19
+ ```
20
+ This will generate two files in the dist folder:
21
+
22
+ - dataengineer_toolbox-0.0.4.tar.gz (source distribution)
23
+ - dataengineer_toolbox-0.0.4-py3-none-any.whl (wheel)
24
+
25
+ 5. (Optional) Publish to PyPI
26
+
27
+ ```
28
+ pip install twine
29
+ twine upload dist/*
30
+ ```
31
+
32
+ You will be prompted for your PyPI username and password.
33
+
34
+ # Run Unit Tests
35
+ ## Run with verbose output and coverage
36
+ pytest --cov=dataeng_toolbox
37
+
38
+ ## Run specific test file
39
+ pytest tests\test_vtable_model.py
40
+
41
+ ## Run specific test class
42
+ pytest tests\test_vtable_serialization.py::TestVTableListSerialization
43
+
44
+ ## Run specific test method
45
+ pytest tests\test_vtable_serialization.py::TestVTableListSerialization::test_dumps_returns_string
@@ -0,0 +1,13 @@
1
+ """
2
+ DataEng Toolbox - A comprehensive data engineering toolbox for Python.
3
+
4
+ This package provides utilities and tools commonly used in data engineering workflows.
5
+ """
6
+
7
+ __version__ = "0.1.0"
8
+ __author__ = "Your Name"
9
+ __email__ = "your.email@example.com"
10
+
11
+ from .data_loader import DataLoader
12
+
13
+ __all__ = ["DataLoader"]
@@ -0,0 +1,70 @@
1
+ # ---------------------------------------------------------------------------
2
+ # File History
3
+ # ---------------------------------------------------------------------------
4
+ # 2026-02-28 Initial creation
5
+ # ---------------------------------------------------------------------------
6
+
7
+ """
8
+ Core module for DataEng Toolbox.
9
+
10
+ This module contains the main Core class with essential functionality.
11
+ """
12
+
13
+ from typing import Dict
14
+ from unicodedata import name
15
+
16
+ from dataeng_toolbox.model import CloudProvider, PlatformType
17
+
18
+
19
class BasePlatform:
    """Thin container pairing a Spark session with its companion utilities handle."""

    def __init__(self, spark, sparkutils) -> None:
        """Remember the Spark session and the platform utility object."""
        self.spark = spark
        self.sparkutils = sparkutils

    def get_spark(self):
        """Return the stored Spark session."""
        return self.spark

    def get_sparkutils(self):
        """Return the stored platform utilities object."""
        return self.sparkutils
29
+
30
class DatabricksPlatform(BasePlatform):
    """Platform wrapper for Databricks workspaces."""

    def __init__(self, spark, dbutils, cloud_provider=CloudProvider.AZURE) -> None:
        """Keep the Spark session / dbutils pair plus the hosting cloud provider."""
        super().__init__(spark, dbutils)
        self.cloud_provider = cloud_provider
34
+
35
+
36
class FabricPlatform(BasePlatform):
    """Platform wrapper for Fabric environments.

    The ``dbutils`` parameter name is inherited from the Databricks wrapper;
    here it presumably carries the Fabric utilities handle — TODO confirm.
    """
    def __init__(self, spark, dbutils) -> None:
        """Store the Spark session and utilities via BasePlatform."""
        super().__init__(spark, dbutils)
39
+
40
class Context:
    """Execution context carrying a platform, a logger, and ad-hoc properties."""

    def __init__(self, platform: BasePlatform, logger) -> None:
        """Bind the platform and logger; start with no custom properties."""
        self.__platform__ = platform
        self.__logger__ = logger
        self.__custom_properties__ = {}

    def get_platform(self) -> BasePlatform:
        """Return the platform this context was built with."""
        return self.__platform__

    def get_logger(self):
        """Return the logger this context was built with."""
        return self.__logger__

    def set_property(self, key: str, value):
        """Store an arbitrary key/value pair on the context."""
        self.__custom_properties__[key] = value

    def get_property(self, key: str):
        """Look up a previously stored property; ``None`` when absent."""
        return self.__custom_properties__.get(key)
59
+
60
class PlatformFactory:
    """Factory for constructing platform wrappers from a PlatformType."""

    @staticmethod
    def create_platform(platform_type: PlatformType, spark=None, dbutils=None,
                        cloud_provider=CloudProvider.AZURE):
        """Factory method to create platform instances.

        Args:
            platform_type: Which platform wrapper to build.
            spark: Spark session handed to the platform.
            dbutils: Platform utility object (e.g. Databricks dbutils).
            cloud_provider: Cloud hosting the Databricks workspace; ignored for
                other platforms. Defaults to AZURE, matching DatabricksPlatform's
                own default, so existing callers see identical behavior.

        Raises:
            ValueError: If platform_type is not a supported platform.
        """
        if platform_type == PlatformType.DATABRICKS:
            # Previously cloud_provider could not be chosen through the factory
            # even though DatabricksPlatform accepts it.
            return DatabricksPlatform(spark, dbutils, cloud_provider)
        elif platform_type == PlatformType.FABRIC:
            # FabricPlatform takes no cloud_provider argument.
            return FabricPlatform(spark, dbutils)
        else:
            raise ValueError(f"Unsupported platform type: {platform_type}")
@@ -0,0 +1,88 @@
1
+ """
2
+ DataLoader module for managing data loading operations.
3
+
4
+ This module provides a singleton DataLoader class for consistent data loading
5
+ across the application.
6
+ """
7
+
8
+ from typing import Optional, Any, Dict
9
+
10
+
11
class DataLoader:
    """
    Singleton class for managing data loading operations.

    Only one instance exists per process: every call to ``DataLoader()``
    yields the same object, giving one shared cache and configuration.
    """

    _instance: Optional['DataLoader'] = None

    def __new__(cls) -> 'DataLoader':
        """Create the singleton on first use; afterwards hand back the cached instance."""
        if cls._instance is None:
            instance = super().__new__(cls)
            instance._initialized = False
            cls._instance = instance
        return cls._instance

    def __init__(self) -> None:
        """Set up cache and config exactly once, however often the class is called."""
        if self._initialized:
            return
        self._initialized = True
        self._cache: Dict[str, Any] = {}
        self._config: Dict[str, Any] = {}

    def load_data(self, source: str) -> Any:
        """
        Load data from *source*, serving repeat requests from the cache.

        Args:
            source (str): The data source path or identifier.

        Returns:
            Any: The loaded data (currently always ``None`` — loading is a TODO).
        """
        try:
            return self._cache[source]
        except KeyError:
            pass

        # TODO: Implement actual data loading logic
        data = None
        self._cache[source] = data
        return data

    def set_config(self, config: Dict[str, Any]) -> None:
        """
        Merge *config* into the current configuration.

        Args:
            config (Dict[str, Any]): Configuration dictionary.
        """
        self._config.update(config)

    def get_config(self) -> Dict[str, Any]:
        """
        Return a shallow copy of the current configuration, so callers
        cannot mutate internal state through the returned dict.
        """
        return dict(self._config)

    def clear_cache(self) -> None:
        """Drop every cached data entry."""
        self._cache.clear()

    def reset(self) -> None:
        """Forget the singleton instance (useful for testing)."""
        DataLoader._instance = None
@@ -0,0 +1,64 @@
1
+
2
+ from typing import Union
3
+ from pyspark.sql.types import StructType, StructField
4
+ from pyspark.sql import DataFrame
5
+ from dataeng_toolbox.model import ColumnModel
6
+ from dataeng_toolbox.model import ScdType, Context, VTableModel
7
+ from abc import ABC, abstractmethod
8
+
9
class BaseEntity(ABC):
    """Base class for all entities.

    Holds the execution context and the entity's SCD handling strategy;
    subclasses implement the actual transformation/deletion logic.
    """

    def __init__(self, context: Context, scd_type: ScdType) -> None:
        self._scd_type = scd_type
        self._context = context

    def get_scd_type(self) -> ScdType:
        """Get the SCD type of the entity."""
        return self._scd_type

    def get_context(self) -> Context:
        """Get the context of the entity."""
        return self._context

    def get_schema(self) -> StructType | None:
        """Get the schema for the entity."""
        raise NotImplementedError("Subclasses must implement this method.")

    @abstractmethod
    def apply_transformations(self) -> DataFrame:
        """Apply transformations to the DataFrame."""
        raise NotImplementedError("Subclasses must implement this method.")

    def apply_deletions(self) -> DataFrame:
        """Apply deletions to the DataFrame."""
        raise NotImplementedError("Subclasses must implement this method.")

    def initialize_state(self) -> None:
        """Initialize any state or dependencies for the entity."""
        pass  # Optional to implement in subclasses

    def initalize_state(self) -> None:
        """Deprecated misspelling of :meth:`initialize_state`.

        Kept as a delegating alias so existing callers of the original
        (misspelled) method name keep working.
        """
        self.initialize_state()

    def finalize_state(self) -> None:
        """Finalize any state or dependencies for the entity."""
        pass  # Optional to implement in subclasses
43
+
44
+
45
+
46
class SilverEntity(BaseEntity):
    """Silver-layer entity built on top of BaseEntity."""

    def __init__(self, context: Context, scd_type: ScdType) -> None:
        # BaseEntity.__init__ already stores the context; the original body
        # re-assigned self._context redundantly.
        super().__init__(context, scd_type)

    def _get_dependencies(self) -> list[VTableModel]:
        """Get the list of dependency tables for the silver entity."""
        return []

    def _load_dependencies(self) -> None:
        """Load dependencies for the silver entity."""
        dependencies = self._get_dependencies()
        for dependency in dependencies:
            pass  # Implement loading logic here

    def get_schema(self) -> list[ColumnModel]:
        """Get the schema for the silver entity."""
        return []
@@ -0,0 +1,110 @@
1
+ from enum import Enum
2
+ from pyspark.sql.types import StructField
3
+ from pydantic import BaseModel, ConfigDict, field_validator, model_validator
4
+
5
+
6
class Constants:
    """Shared keys and SCD Type 2 defaults used across the toolbox."""

    # Keys/column names for identity and hash bookkeeping. METADATA_IDENTITY_KEY
    # is read from StructField metadata (see ColumnModel.is_identity); the hash
    # names double as MERGE column names in spark_utils.
    METADATA_IDENTITY_KEY = "identity"
    METADATA_DATA_HASH = "data_hash"
    METADATA_KEY_HASH = "key_hash"

    # Default column names / values for tracking SCD Type 2 history.
    DEFAULT_SCD2_EFFECTIVE_DATE_COL = "EffectiveDate"
    DEFAULT_SCD2_END_DATE_COL = "EndDate"
    DEFAULT_SCD2_IS_CURRENT_COL = "IsCurrent"
    DEFAULT_SCD2_CURRENT_FLAG_VALUE = True
    DEFAULT_SCD2_END_DATE_FAR_FUTURE = "9999-12-31"
17
+
18
+
19
class ScdType(Enum):
    """Slowly Changing Dimension handling strategy for an entity."""
    UNDEFINED = 0
    SCD0 = 1
    SCD1 = 2
    SCD2 = 3

class TableType(Enum):
    """How a table's storage is managed."""
    UNDEFINED = 0
    MANAGED = 1
    EXTERNAL = 2

class FileType(Enum):
    """Supported on-disk file formats."""
    UNDEFINED = 0
    CSV = 1
    PARQUET = 2
    DELTA = 3
    JSON = 4


class IngestionType(Enum):
    """Load strategy for bringing data into a table."""
    UNDEFINED = 0
    FULL_LOAD = 1
    INCREMENTAL = 2

class PlatformType(Enum):
    """Compute platform the toolbox runs on."""
    UNDEFINED = 0
    DATABRICKS = 1
    FABRIC = 2

class CloudProvider(Enum):
    """Cloud vendor hosting the platform."""
    UNDEFINED = 0
    AWS = 1
    AZURE = 2
    GCP = 3
53
+
54
class ColumnModel(StructField):
    """StructField subclass that understands toolbox metadata flags."""

    def __init__(self, *args, **kwargs) -> None:
        # Pure pass-through; behaves exactly like StructField.
        # (Original declared the varargs as `*arg` — conventional name restored.)
        super().__init__(*args, **kwargs)

    def is_identity(self) -> bool:
        """Check if the column is an identity column.

        Returns:
            True only when the field's metadata explicitly maps
            Constants.METADATA_IDENTITY_KEY to True; False otherwise.
        """
        return self.metadata.get(Constants.METADATA_IDENTITY_KEY) is True
63
+
64
+
65
class VFileModel(BaseModel):
    """Pydantic model for representing a virtual file."""
    # validate_assignment=True re-runs validation whenever a field is mutated.
    model_config = ConfigDict(frozen=False, validate_assignment=True)
    catalog: str | None = None    # optional catalog qualifier
    namespace: str | None = None  # optional schema/namespace qualifier
    name: str                     # object name (required)
    file_path: str                # backing file location (required at this level)
    file_type: FileType = FileType.UNDEFINED  # on-disk format, if known
73
+
74
class VTableModel(VFileModel):
    """Pydantic model for representing a virtual table."""
    # Tables need not be file-backed, so the path becomes optional here.
    file_path: str | None = None
    table_type: TableType = TableType.UNDEFINED

    @model_validator(mode="after")
    def validate_external_requires_delta(self) -> "VTableModel":
        """EXTERNAL tables must use DELTA file type."""
        is_external = self.table_type == TableType.EXTERNAL
        if is_external and self.file_type != FileType.DELTA:
            message = f"EXTERNAL tables must have FileType.DELTA, got {self.file_type}"
            raise ValueError(message)
        return self
87
+
88
+
89
def main() -> None:
    """Simple demo entrypoint for the module.

    Creates example VTableModel instances and prints their serialized forms.
    """
    examples = [
        ("Example VTableModel v1",
         VTableModel(catalog="main", namespace="sales", name="orders")),
        ("\nExample VTableModel v2 (managed)",
         VTableModel(catalog="main", namespace="inventory", name="products",
                     table_type=TableType.MANAGED)),
    ]

    for label, vtable in examples:
        print(f"{label}:")
        print(vtable)
        print("model_dump:", vtable.model_dump())
        print("model_dump_json:", vtable.model_dump_json())


if __name__ == "__main__":
    main()
110
+
@@ -0,0 +1,170 @@
1
+ from pyspark.sql import SparkSession, DataFrame
2
+ from pyspark.sql.functions import expr
3
+
4
+ from dataeng_toolbox.model import Constants, FileType
5
+ from dataeng_toolbox.utils import get_logger
6
+
7
# Module-wide logger, named after this module.
logger = get_logger(__name__)
8
+
9
+
10
def scd_type1(spark: SparkSession, target_table: str, source_df: DataFrame,
              composite_keys: list, scd_columns: list) -> None:
    """
    Implements SCD Type 1 using Spark MERGE INTO.
    Updates existing records with new values, inserts new records.

    Args:
        spark: SparkSession
        target_table: Target table name
        source_df: Source Spark DataFrame
        composite_keys: List of composite key columns for matching
        scd_columns: List of columns to track changes
    """
    # Register the source frame under the fixed name the MERGE references.
    # NOTE(review): the view name "source" is shared across the Spark session —
    # concurrent callers would clobber each other; confirm single-threaded use.
    source_df.createOrReplaceTempView("source")

    # Row-matching predicate over every composite key column.
    join_condition = " AND ".join([f"target.{col} = source.{col}" for col in composite_keys])

    # Type 1 = overwrite in place: every tracked column is set from source.
    update_set = ", ".join([f"target.{col} = source.{col}" for col in scd_columns])

    insert_columns = ", ".join(composite_keys + scd_columns)
    insert_values = ", ".join([f"source.{col}" for col in composite_keys + scd_columns])

    merge_sql = f"""
    MERGE INTO {target_table} target
    USING source
    ON {join_condition}
    WHEN MATCHED THEN
        UPDATE SET {update_set}
    WHEN NOT MATCHED THEN
        INSERT ({insert_columns})
        VALUES ({insert_values})
    """

    logger.info(f"Executing SCD Type 1 MERGE SQL:\n{merge_sql}")
    spark.sql(merge_sql)
45
+
46
+
47
def scd_type1_with_hash(spark: SparkSession, target_table: str, source_df: DataFrame,
                        composite_keys: list, scd_columns: list, add_key_hash: bool = False,
                        add_data_hash: bool = False, identity_column: str = None) -> None:
    """
    Implements SCD Type 1 using Spark MERGE INTO, matching rows on a key-hash
    column and optionally skipping no-op updates via a data-hash column.

    Args:
        spark: SparkSession
        target_table: Target table name
        source_df: Source Spark DataFrame
        composite_keys: List of composite key columns for matching
        scd_columns: List of columns to track changes
        add_key_hash: Whether to add a hash column for the composite key
        add_data_hash: Whether to add a hash column for the SCD columns
        identity_column: Optional identity column for the target table

    Note:
        When add_key_hash/add_data_hash are False, the source DataFrame and the
        target table are assumed to already contain the key_hash/data_hash
        columns referenced by the MERGE condition.
    """
    # Work on copies so the caller's lists are not mutated (the original
    # appended the hash column names into the caller-owned lists).
    key_cols = list(composite_keys)
    data_cols = list(scd_columns)

    # Bug fix: derived columns must be added BEFORE the temp view is registered,
    # otherwise the MERGE cannot see them. Spark SQL's hash() is invoked via
    # expr(); Python's builtin hash() would not produce a Column expression.
    if add_key_hash:
        source_df = source_df.withColumn(
            Constants.METADATA_KEY_HASH, expr(f"hash({', '.join(key_cols)})")
        )
        key_cols.append(Constants.METADATA_KEY_HASH)

    if add_data_hash:
        source_df = source_df.withColumn(
            Constants.METADATA_DATA_HASH, expr(f"hash({', '.join(data_cols)})")
        )
        data_cols.append(Constants.METADATA_DATA_HASH)

    if identity_column:
        # NOTE(review): the generated uuid column is not part of the INSERT
        # column list below (matching the original behavior) — confirm whether
        # it should be inserted into the target.
        source_df = source_df.withColumn(identity_column, expr("uuid()"))

    # Register the fully-prepared frame for the SQL MERGE below.
    source_df.createOrReplaceTempView("source")

    update_set = ", ".join([f"target.{col} = source.{col}" for col in data_cols])

    insert_cols = key_cols + data_cols
    insert_columns = ", ".join(insert_cols)
    insert_values = ", ".join([f"source.{col}" for col in insert_cols])

    if add_data_hash:
        # Only touch rows whose tracked data actually changed.
        merge_sql = f"""
        MERGE INTO {target_table} target
        USING source
        ON target.{Constants.METADATA_KEY_HASH} = source.{Constants.METADATA_KEY_HASH}
        WHEN MATCHED AND (
            target.{Constants.METADATA_DATA_HASH} != source.{Constants.METADATA_DATA_HASH}
        )
        THEN
            UPDATE SET {update_set}
        WHEN NOT MATCHED THEN
            INSERT ({insert_columns})
            VALUES ({insert_values})
        """
    else:
        merge_sql = f"""
        MERGE INTO {target_table} target
        USING source
        ON target.{Constants.METADATA_KEY_HASH} = source.{Constants.METADATA_KEY_HASH}
        WHEN MATCHED
        THEN
            UPDATE SET {update_set}
        WHEN NOT MATCHED THEN
            INSERT ({insert_columns})
            VALUES ({insert_values})
        """

    logger.info(f"Executing SCD Type 1 MERGE SQL:\n{merge_sql}")
    spark.sql(merge_sql)
111
+
112
+
113
def scd_type2(spark, target_table: str, source_df, join_keys: list,
              scd_columns: list, business_key: str) -> None:
    """
    Implements (part of) SCD Type 2 using Spark MERGE INTO: expires the
    current version of changed rows and inserts brand-new business keys.
    (The original docstring incorrectly said "SCD Type 1".)

    NOTE(review): a full SCD Type 2 flow also inserts a NEW current row for
    each changed business key; this single MERGE only closes the old row.
    Confirm whether a second insert pass runs elsewhere.

    Args:
        spark: SparkSession
        target_table: Target table name
        source_df: Source DataFrame
        join_keys: List of join key columns
        scd_columns: List of columns to track changes
        business_key: Business key column name
    """
    source_df.createOrReplaceTempView("source")

    join_condition = " AND ".join([f"target.{col} = source.{col}" for col in join_keys])

    # Any difference in a tracked column marks the current row as changed.
    change_condition = " OR ".join([f"target.{col} != source.{col}" for col in scd_columns])

    merge_sql = f"""
    MERGE INTO {target_table} target
    USING source
    ON {join_condition}
    WHEN MATCHED AND (
        {change_condition}
    ) THEN
        UPDATE SET
            is_current = false,
            is_deleted = false,
            end_date = current_date()
    WHEN NOT MATCHED THEN
        INSERT ({business_key}, {", ".join(scd_columns)}, is_current, is_deleted, start_date, end_date)
        VALUES (source.{business_key}, {", ".join([f"source.{col}" for col in scd_columns])}, true, false, current_date(), null)
    """

    # Log before executing, matching the sibling SCD helpers (the original
    # built an unused `scd_updates` string and logged nothing).
    logger.info(f"Executing SCD Type 2 MERGE SQL:\n{merge_sql}")
    spark.sql(merge_sql)
149
+
150
+
151
def load_file(spark: SparkSession, file_path: str, file_type: FileType) -> DataFrame:
    """
    Loads a file into a Spark DataFrame based on the specified file type.

    Args:
        spark: SparkSession
        file_path: Path to the file
        file_type: Type of the file (CSV, JSON, Parquet, or Delta)

    Returns:
        DataFrame containing the loaded data

    Raises:
        ValueError: If the file type is not supported.
    """
    if file_type == FileType.CSV:
        return spark.read.csv(file_path, header=True, inferSchema=True)
    elif file_type == FileType.JSON:
        return spark.read.json(file_path)
    elif file_type == FileType.PARQUET:
        return spark.read.parquet(file_path)
    elif file_type == FileType.DELTA:
        # FileType.DELTA exists in the enum but was previously rejected here.
        return spark.read.format("delta").load(file_path)
    else:
        raise ValueError(f"Unsupported file type: {file_type}")