dataengineer_toolbox 0.0.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (25) hide show
  1. dataengineer_toolbox-0.0.4/LICENSE +21 -0
  2. dataengineer_toolbox-0.0.4/MANIFEST.in +8 -0
  3. dataengineer_toolbox-0.0.4/PKG-INFO +100 -0
  4. dataengineer_toolbox-0.0.4/README.md +45 -0
  5. dataengineer_toolbox-0.0.4/dataeng_toolbox/__init__.py +13 -0
  6. dataengineer_toolbox-0.0.4/dataeng_toolbox/built_in/scd_test.py +0 -0
  7. dataengineer_toolbox-0.0.4/dataeng_toolbox/core/__init__.py +70 -0
  8. dataengineer_toolbox-0.0.4/dataeng_toolbox/data_loader.py +88 -0
  9. dataengineer_toolbox-0.0.4/dataeng_toolbox/entity.py +64 -0
  10. dataengineer_toolbox-0.0.4/dataeng_toolbox/model.py +110 -0
  11. dataengineer_toolbox-0.0.4/dataeng_toolbox/spark_utils.py +170 -0
  12. dataengineer_toolbox-0.0.4/dataeng_toolbox/utils.py +30 -0
  13. dataengineer_toolbox-0.0.4/dataengineer_toolbox.egg-info/PKG-INFO +100 -0
  14. dataengineer_toolbox-0.0.4/dataengineer_toolbox.egg-info/SOURCES.txt +23 -0
  15. dataengineer_toolbox-0.0.4/dataengineer_toolbox.egg-info/dependency_links.txt +1 -0
  16. dataengineer_toolbox-0.0.4/dataengineer_toolbox.egg-info/requires.txt +10 -0
  17. dataengineer_toolbox-0.0.4/dataengineer_toolbox.egg-info/top_level.txt +1 -0
  18. dataengineer_toolbox-0.0.4/examples/README.md +34 -0
  19. dataengineer_toolbox-0.0.4/examples/basic_usage.py +40 -0
  20. dataengineer_toolbox-0.0.4/examples/vtable_model_example.py +123 -0
  21. dataengineer_toolbox-0.0.4/pyproject.toml +66 -0
  22. dataengineer_toolbox-0.0.4/setup.cfg +4 -0
  23. dataengineer_toolbox-0.0.4/tests/__init__.py +1 -0
  24. dataengineer_toolbox-0.0.4/tests/test_vtable_model.py +340 -0
  25. dataengineer_toolbox-0.0.4/tests/test_vtable_serialization.py +215 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Dac Toan Ho
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,8 @@
1
+ include README.md
2
+ include LICENSE
3
+ include pyproject.toml
4
+ recursive-include dataeng_toolbox *.py
5
+ recursive-include tests *.py
6
+ recursive-include examples *.py *.md
7
+ exclude .gitignore
8
+ exclude .github/workflows/*.yml
@@ -0,0 +1,100 @@
1
+ Metadata-Version: 2.4
2
+ Name: dataengineer_toolbox
3
+ Version: 0.0.4
4
+ Summary: A comprehensive data engineering toolbox for Python
5
+ Author-email: Your Name <your.email@example.com>
6
+ License: MIT License
7
+
8
+ Copyright (c) 2025 Dac Toan Ho
9
+
10
+ Permission is hereby granted, free of charge, to any person obtaining a copy
11
+ of this software and associated documentation files (the "Software"), to deal
12
+ in the Software without restriction, including without limitation the rights
13
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14
+ copies of the Software, and to permit persons to whom the Software is
15
+ furnished to do so, subject to the following conditions:
16
+
17
+ The above copyright notice and this permission notice shall be included in all
18
+ copies or substantial portions of the Software.
19
+
20
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26
+ SOFTWARE.
27
+
28
+ Project-URL: Homepage, https://github.com/yourusername/dataengineer_toolbox
29
+ Project-URL: Repository, https://github.com/yourusername/dataengineer_toolbox
30
+ Project-URL: Issues, https://github.com/yourusername/dataengineer_toolbox/issues
31
+ Keywords: data engineering,toolbox,python
32
+ Classifier: Development Status :: 3 - Alpha
33
+ Classifier: Intended Audience :: Developers
34
+ Classifier: Programming Language :: Python :: 3
35
+ Classifier: Programming Language :: Python :: 3.9
36
+ Classifier: Programming Language :: Python :: 3.10
37
+ Classifier: Programming Language :: Python :: 3.11
38
+ Classifier: Programming Language :: Python :: 3.12
39
+ Classifier: Programming Language :: Python :: 3.13
40
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
41
+ Classifier: Topic :: Scientific/Engineering
42
+ Requires-Python: >=3.9
43
+ Description-Content-Type: text/markdown
44
+ License-File: LICENSE
45
+ Requires-Dist: pyspark==3.5.7
46
+ Requires-Dist: pydantic>=2.0
47
+ Requires-Dist: pydantic-core>=2.0
48
+ Provides-Extra: dev
49
+ Requires-Dist: pytest>=7.0; extra == "dev"
50
+ Requires-Dist: pytest-cov>=4.0; extra == "dev"
51
+ Requires-Dist: black>=23.0; extra == "dev"
52
+ Requires-Dist: flake8>=6.0; extra == "dev"
53
+ Requires-Dist: mypy>=1.0; extra == "dev"
54
+ Dynamic: license-file
55
+
56
+ # DataEngineerToolbox
57
+ 1. Set up virtual environment
58
+ ```
59
+ python -m venv venv
60
+ venv\Scripts\activate
61
+ ```
62
+ 2. Install build dependencies
63
+
64
+ ```
65
+ pip install build
66
+ ```
67
+ 3. Install package dependencies
68
+ ```
69
+ pip install -e .
70
+ ```
71
+ 4. Build the wheel
72
+ ```
73
+ python -m build
74
+ ```
75
+ This will generate two files in the dist folder:
76
+
77
+ - dataengineer_toolbox-0.0.4.tar.gz (source distribution)
78
+ - dataengineer_toolbox-0.0.4-py3-none-any.whl (wheel)
79
+
80
+ 5. (Optional) Publish to PyPI
81
+
82
+ ```
83
+ pip install twine
84
+ twine upload dist/*
85
+ ```
86
+
87
+ You will be prompted for your PyPI username and password.
88
+
89
+ # Run Unit Tests
90
+ ## Run with verbose output and coverage
91
+ pytest --cov=dataeng_toolbox
92
+
93
+ ## Run specific test file
94
+ pytest tests\test_vtable_model.py
95
+
96
+ ## Run specific test class
97
+ pytest tests\test_vtable_serialization.py::TestVTableListSerialization
98
+
99
+ ## Run specific test method
100
+ pytest tests\test_vtable_serialization.py::TestVTableListSerialization::test_dumps_returns_string
@@ -0,0 +1,45 @@
1
+ # DataEngineerToolbox
2
+ 1. Set up virtual environment
3
+ ```
4
+ python -m venv venv
5
+ venv\Scripts\activate
6
+ ```
7
+ 2. Install build dependencies
8
+
9
+ ```
10
+ pip install build
11
+ ```
12
+ 3. Install package dependencies
13
+ ```
14
+ pip install -e .
15
+ ```
16
+ 4. Build the wheel
17
+ ```
18
+ python -m build
19
+ ```
20
+ This will generate two files in the dist folder:
21
+
22
+ - dataengineer_toolbox-0.0.4.tar.gz (source distribution)
23
+ - dataengineer_toolbox-0.0.4-py3-none-any.whl (wheel)
24
+
25
+ 5. (Optional) Publish to PyPI
26
+
27
+ ```
28
+ pip install twine
29
+ twine upload dist/*
30
+ ```
31
+
32
+ You will be prompted for your PyPI username and password.
33
+
34
+ # Run Unit Tests
35
+ ## Run with verbose output and coverage
36
+ pytest --cov=dataeng_toolbox
37
+
38
+ ## Run specific test file
39
+ pytest tests\test_vtable_model.py
40
+
41
+ ## Run specific test class
42
+ pytest tests\test_vtable_serialization.py::TestVTableListSerialization
43
+
44
+ ## Run specific test method
45
+ pytest tests\test_vtable_serialization.py::TestVTableListSerialization::test_dumps_returns_string
@@ -0,0 +1,13 @@
1
+ """
2
+ DataEng Toolbox - A comprehensive data engineering toolbox for Python.
3
+
4
+ This package provides utilities and tools commonly used in data engineering workflows.
5
+ """
6
+
7
+ __version__ = "0.1.0"
8
+ __author__ = "Your Name"
9
+ __email__ = "your.email@example.com"
10
+
11
+ from .data_loader import DataLoader
12
+
13
+ __all__ = ["DataLoader"]
@@ -0,0 +1,70 @@
1
+ # ---------------------------------------------------------------------------
2
+ # File History
3
+ # ---------------------------------------------------------------------------
4
+ # 2026-02-28 Initial creation
5
+ # ---------------------------------------------------------------------------
6
+
7
+ """
8
+ Core module for DataEng Toolbox.
9
+
10
+ This module contains the main Core class with essential functionality.
11
+ """
12
+
13
+ from typing import Dict
14
+ from unicodedata import name
15
+
16
+ from dataeng_toolbox.model import CloudProvider, PlatformType
17
+
18
+
19
class BasePlatform:
    """Thin container pairing a Spark session with its companion utilities handle."""

    def __init__(self, spark, sparkutils) -> None:
        """Remember the Spark session and the platform utility object."""
        self.spark = spark
        self.sparkutils = sparkutils

    def get_spark(self):
        """Return the stored Spark session."""
        return self.spark

    def get_sparkutils(self):
        """Return the stored platform utilities object."""
        return self.sparkutils
29
+
30
class DatabricksPlatform(BasePlatform):
    """Platform wrapper for Databricks workspaces."""

    def __init__(self, spark, dbutils, cloud_provider=CloudProvider.AZURE) -> None:
        """Keep the Spark session / dbutils pair plus the hosting cloud provider."""
        super().__init__(spark, dbutils)
        self.cloud_provider = cloud_provider
34
+
35
+
36
class FabricPlatform(BasePlatform):
    """Platform wrapper for Fabric environments.

    The ``dbutils`` parameter name is inherited from the Databricks wrapper;
    here it presumably carries the Fabric utilities handle — TODO confirm.
    """
    def __init__(self, spark, dbutils) -> None:
        """Store the Spark session and utilities via BasePlatform."""
        super().__init__(spark, dbutils)
39
+
40
class Context:
    """Execution context carrying a platform, a logger, and ad-hoc properties."""

    def __init__(self, platform: BasePlatform, logger) -> None:
        """Bind the platform and logger; start with no custom properties."""
        self.__platform__ = platform
        self.__logger__ = logger
        self.__custom_properties__ = {}

    def get_platform(self) -> BasePlatform:
        """Return the platform this context was built with."""
        return self.__platform__

    def get_logger(self):
        """Return the logger this context was built with."""
        return self.__logger__

    def set_property(self, key: str, value):
        """Store an arbitrary key/value pair on the context."""
        self.__custom_properties__[key] = value

    def get_property(self, key: str):
        """Look up a previously stored property; ``None`` when absent."""
        return self.__custom_properties__.get(key)
59
+
60
class PlatformFactory:
    """Factory for constructing platform wrappers from a PlatformType."""

    @staticmethod
    def create_platform(platform_type: PlatformType, spark=None, dbutils=None,
                        cloud_provider=CloudProvider.AZURE):
        """Factory method to create platform instances.

        Args:
            platform_type: Which platform wrapper to build.
            spark: Spark session handed to the platform.
            dbutils: Platform utility object (e.g. Databricks dbutils).
            cloud_provider: Cloud hosting the Databricks workspace; ignored for
                other platforms. Defaults to AZURE, matching DatabricksPlatform's
                own default, so existing callers see identical behavior.

        Raises:
            ValueError: If platform_type is not a supported platform.
        """
        if platform_type == PlatformType.DATABRICKS:
            # Previously cloud_provider could not be chosen through the factory
            # even though DatabricksPlatform accepts it.
            return DatabricksPlatform(spark, dbutils, cloud_provider)
        elif platform_type == PlatformType.FABRIC:
            # FabricPlatform takes no cloud_provider argument.
            return FabricPlatform(spark, dbutils)
        else:
            raise ValueError(f"Unsupported platform type: {platform_type}")
@@ -0,0 +1,88 @@
1
+ """
2
+ DataLoader module for managing data loading operations.
3
+
4
+ This module provides a singleton DataLoader class for consistent data loading
5
+ across the application.
6
+ """
7
+
8
+ from typing import Optional, Any, Dict
9
+
10
+
11
class DataLoader:
    """
    Singleton class for managing data loading operations.

    Only one instance exists per process: every call to ``DataLoader()``
    yields the same object, giving one shared cache and configuration.
    """

    _instance: Optional['DataLoader'] = None

    def __new__(cls) -> 'DataLoader':
        """Create the singleton on first use; afterwards hand back the cached instance."""
        if cls._instance is None:
            instance = super().__new__(cls)
            instance._initialized = False
            cls._instance = instance
        return cls._instance

    def __init__(self) -> None:
        """Set up cache and config exactly once, however often the class is called."""
        if self._initialized:
            return
        self._initialized = True
        self._cache: Dict[str, Any] = {}
        self._config: Dict[str, Any] = {}

    def load_data(self, source: str) -> Any:
        """
        Load data from *source*, serving repeat requests from the cache.

        Args:
            source (str): The data source path or identifier.

        Returns:
            Any: The loaded data (currently always ``None`` — loading is a TODO).
        """
        try:
            return self._cache[source]
        except KeyError:
            pass

        # TODO: Implement actual data loading logic
        data = None
        self._cache[source] = data
        return data

    def set_config(self, config: Dict[str, Any]) -> None:
        """
        Merge *config* into the current configuration.

        Args:
            config (Dict[str, Any]): Configuration dictionary.
        """
        self._config.update(config)

    def get_config(self) -> Dict[str, Any]:
        """
        Return a shallow copy of the current configuration, so callers
        cannot mutate internal state through the returned dict.
        """
        return dict(self._config)

    def clear_cache(self) -> None:
        """Drop every cached data entry."""
        self._cache.clear()

    def reset(self) -> None:
        """Forget the singleton instance (useful for testing)."""
        DataLoader._instance = None
@@ -0,0 +1,64 @@
1
+
2
+ from typing import Union
3
+ from pyspark.sql.types import StructType, StructField
4
+ from pyspark.sql import DataFrame
5
+ from dataeng_toolbox.model import ColumnModel
6
+ from dataeng_toolbox.model import ScdType, Context, VTableModel
7
+ from abc import ABC, abstractmethod
8
+
9
class BaseEntity(ABC):
    """Base class for all entities.

    Holds the execution context and the entity's SCD handling strategy;
    subclasses implement the actual transformation/deletion logic.
    """

    def __init__(self, context: Context, scd_type: ScdType) -> None:
        self._scd_type = scd_type
        self._context = context

    def get_scd_type(self) -> ScdType:
        """Get the SCD type of the entity."""
        return self._scd_type

    def get_context(self) -> Context:
        """Get the context of the entity."""
        return self._context

    def get_schema(self) -> StructType | None:
        """Get the schema for the entity."""
        raise NotImplementedError("Subclasses must implement this method.")

    @abstractmethod
    def apply_transformations(self) -> DataFrame:
        """Apply transformations to the DataFrame."""
        raise NotImplementedError("Subclasses must implement this method.")

    def apply_deletions(self) -> DataFrame:
        """Apply deletions to the DataFrame."""
        raise NotImplementedError("Subclasses must implement this method.")

    def initialize_state(self) -> None:
        """Initialize any state or dependencies for the entity."""
        pass  # Optional to implement in subclasses

    def initalize_state(self) -> None:
        """Deprecated misspelling of :meth:`initialize_state`.

        Kept as a delegating alias so existing callers of the original
        (misspelled) method name keep working.
        """
        self.initialize_state()

    def finalize_state(self) -> None:
        """Finalize any state or dependencies for the entity."""
        pass  # Optional to implement in subclasses
43
+
44
+
45
+
46
class SilverEntity(BaseEntity):
    """Silver-layer entity built on top of BaseEntity."""

    def __init__(self, context: Context, scd_type: ScdType) -> None:
        # BaseEntity.__init__ already stores the context; the original body
        # re-assigned self._context redundantly.
        super().__init__(context, scd_type)

    def _get_dependencies(self) -> list[VTableModel]:
        """Get the list of dependency tables for the silver entity."""
        return []

    def _load_dependencies(self) -> None:
        """Load dependencies for the silver entity."""
        dependencies = self._get_dependencies()
        for dependency in dependencies:
            pass  # Implement loading logic here

    def get_schema(self) -> list[ColumnModel]:
        """Get the schema for the silver entity."""
        return []
@@ -0,0 +1,110 @@
1
+ from enum import Enum
2
+ from pyspark.sql.types import StructField
3
+ from pydantic import BaseModel, ConfigDict, field_validator, model_validator
4
+
5
+
6
class Constants:
    """Shared keys and SCD Type 2 defaults used across the toolbox."""

    # Keys/column names for identity and hash bookkeeping. METADATA_IDENTITY_KEY
    # is read from StructField metadata (see ColumnModel.is_identity); the hash
    # names double as MERGE column names in spark_utils.
    METADATA_IDENTITY_KEY = "identity"
    METADATA_DATA_HASH = "data_hash"
    METADATA_KEY_HASH = "key_hash"

    # Default column names / values for tracking SCD Type 2 history.
    DEFAULT_SCD2_EFFECTIVE_DATE_COL = "EffectiveDate"
    DEFAULT_SCD2_END_DATE_COL = "EndDate"
    DEFAULT_SCD2_IS_CURRENT_COL = "IsCurrent"
    DEFAULT_SCD2_CURRENT_FLAG_VALUE = True
    DEFAULT_SCD2_END_DATE_FAR_FUTURE = "9999-12-31"
17
+
18
+
19
class ScdType(Enum):
    """Slowly Changing Dimension handling strategy for an entity."""
    UNDEFINED = 0
    SCD0 = 1
    SCD1 = 2
    SCD2 = 3

class TableType(Enum):
    """How a table's storage is managed."""
    UNDEFINED = 0
    MANAGED = 1
    EXTERNAL = 2

class FileType(Enum):
    """Supported on-disk file formats."""
    UNDEFINED = 0
    CSV = 1
    PARQUET = 2
    DELTA = 3
    JSON = 4


class IngestionType(Enum):
    """Load strategy for bringing data into a table."""
    UNDEFINED = 0
    FULL_LOAD = 1
    INCREMENTAL = 2

class PlatformType(Enum):
    """Compute platform the toolbox runs on."""
    UNDEFINED = 0
    DATABRICKS = 1
    FABRIC = 2

class CloudProvider(Enum):
    """Cloud vendor hosting the platform."""
    UNDEFINED = 0
    AWS = 1
    AZURE = 2
    GCP = 3
53
+
54
class ColumnModel(StructField):
    """StructField subclass that understands toolbox metadata flags."""

    def __init__(self, *args, **kwargs) -> None:
        # Pure pass-through; behaves exactly like StructField.
        # (Original declared the varargs as `*arg` — conventional name restored.)
        super().__init__(*args, **kwargs)

    def is_identity(self) -> bool:
        """Check if the column is an identity column.

        Returns:
            True only when the field's metadata explicitly maps
            Constants.METADATA_IDENTITY_KEY to True; False otherwise.
        """
        return self.metadata.get(Constants.METADATA_IDENTITY_KEY) is True
63
+
64
+
65
class VFileModel(BaseModel):
    """Pydantic model for representing a virtual file."""
    # validate_assignment=True re-runs validation whenever a field is mutated.
    model_config = ConfigDict(frozen=False, validate_assignment=True)
    catalog: str | None = None    # optional catalog qualifier
    namespace: str | None = None  # optional schema/namespace qualifier
    name: str                     # object name (required)
    file_path: str                # backing file location (required at this level)
    file_type: FileType = FileType.UNDEFINED  # on-disk format, if known
73
+
74
class VTableModel(VFileModel):
    """Pydantic model for representing a virtual table."""
    # Tables need not be file-backed, so the path becomes optional here.
    file_path: str | None = None
    table_type: TableType = TableType.UNDEFINED

    @model_validator(mode="after")
    def validate_external_requires_delta(self) -> "VTableModel":
        """EXTERNAL tables must use DELTA file type."""
        is_external = self.table_type == TableType.EXTERNAL
        if is_external and self.file_type != FileType.DELTA:
            message = f"EXTERNAL tables must have FileType.DELTA, got {self.file_type}"
            raise ValueError(message)
        return self
87
+
88
+
89
def main() -> None:
    """Simple demo entrypoint for the module.

    Creates example VTableModel instances and prints their serialized forms.
    """
    examples = [
        ("Example VTableModel v1",
         VTableModel(catalog="main", namespace="sales", name="orders")),
        ("\nExample VTableModel v2 (managed)",
         VTableModel(catalog="main", namespace="inventory", name="products",
                     table_type=TableType.MANAGED)),
    ]

    for label, vtable in examples:
        print(f"{label}:")
        print(vtable)
        print("model_dump:", vtable.model_dump())
        print("model_dump_json:", vtable.model_dump_json())


if __name__ == "__main__":
    main()
110
+
@@ -0,0 +1,170 @@
1
+ from pyspark.sql import SparkSession, DataFrame
2
+ from pyspark.sql.functions import expr
3
+
4
+ from dataeng_toolbox.model import Constants, FileType
5
+ from dataeng_toolbox.utils import get_logger
6
+
7
# Module-wide logger, named after this module.
logger = get_logger(__name__)
8
+
9
+
10
def scd_type1(spark: SparkSession, target_table: str, source_df: DataFrame,
              composite_keys: list, scd_columns: list) -> None:
    """
    Implements SCD Type 1 using Spark MERGE INTO.
    Updates existing records with new values, inserts new records.

    Args:
        spark: SparkSession
        target_table: Target table name
        source_df: Source Spark DataFrame
        composite_keys: List of composite key columns for matching
        scd_columns: List of columns to track changes
    """
    # Register the source frame under the fixed name the MERGE references.
    # NOTE(review): the view name "source" is shared across the Spark session —
    # concurrent callers would clobber each other; confirm single-threaded use.
    source_df.createOrReplaceTempView("source")

    # Row-matching predicate over every composite key column.
    join_condition = " AND ".join([f"target.{col} = source.{col}" for col in composite_keys])

    # Type 1 = overwrite in place: every tracked column is set from source.
    update_set = ", ".join([f"target.{col} = source.{col}" for col in scd_columns])

    insert_columns = ", ".join(composite_keys + scd_columns)
    insert_values = ", ".join([f"source.{col}" for col in composite_keys + scd_columns])

    merge_sql = f"""
    MERGE INTO {target_table} target
    USING source
    ON {join_condition}
    WHEN MATCHED THEN
        UPDATE SET {update_set}
    WHEN NOT MATCHED THEN
        INSERT ({insert_columns})
        VALUES ({insert_values})
    """

    logger.info(f"Executing SCD Type 1 MERGE SQL:\n{merge_sql}")
    spark.sql(merge_sql)
45
+
46
+
47
def scd_type1_with_hash(spark: SparkSession, target_table: str, source_df: DataFrame,
                        composite_keys: list, scd_columns: list, add_key_hash: bool = False,
                        add_data_hash: bool = False, identity_column: str = None) -> None:
    """
    Implements SCD Type 1 using Spark MERGE INTO, matching rows on a key-hash
    column and optionally skipping no-op updates via a data-hash column.

    Args:
        spark: SparkSession
        target_table: Target table name
        source_df: Source Spark DataFrame
        composite_keys: List of composite key columns for matching
        scd_columns: List of columns to track changes
        add_key_hash: Whether to add a hash column for the composite key
        add_data_hash: Whether to add a hash column for the SCD columns
        identity_column: Optional identity column for the target table

    Note:
        When add_key_hash/add_data_hash are False, the source DataFrame and the
        target table are assumed to already contain the key_hash/data_hash
        columns referenced by the MERGE condition.
    """
    # Work on copies so the caller's lists are not mutated (the original
    # appended the hash column names into the caller-owned lists).
    key_cols = list(composite_keys)
    data_cols = list(scd_columns)

    # Bug fix: derived columns must be added BEFORE the temp view is registered,
    # otherwise the MERGE cannot see them. Spark SQL's hash() is invoked via
    # expr(); Python's builtin hash() would not produce a Column expression.
    if add_key_hash:
        source_df = source_df.withColumn(
            Constants.METADATA_KEY_HASH, expr(f"hash({', '.join(key_cols)})")
        )
        key_cols.append(Constants.METADATA_KEY_HASH)

    if add_data_hash:
        source_df = source_df.withColumn(
            Constants.METADATA_DATA_HASH, expr(f"hash({', '.join(data_cols)})")
        )
        data_cols.append(Constants.METADATA_DATA_HASH)

    if identity_column:
        # NOTE(review): the generated uuid column is not part of the INSERT
        # column list below (matching the original behavior) — confirm whether
        # it should be inserted into the target.
        source_df = source_df.withColumn(identity_column, expr("uuid()"))

    # Register the fully-prepared frame for the SQL MERGE below.
    source_df.createOrReplaceTempView("source")

    update_set = ", ".join([f"target.{col} = source.{col}" for col in data_cols])

    insert_cols = key_cols + data_cols
    insert_columns = ", ".join(insert_cols)
    insert_values = ", ".join([f"source.{col}" for col in insert_cols])

    if add_data_hash:
        # Only touch rows whose tracked data actually changed.
        merge_sql = f"""
        MERGE INTO {target_table} target
        USING source
        ON target.{Constants.METADATA_KEY_HASH} = source.{Constants.METADATA_KEY_HASH}
        WHEN MATCHED AND (
            target.{Constants.METADATA_DATA_HASH} != source.{Constants.METADATA_DATA_HASH}
        )
        THEN
            UPDATE SET {update_set}
        WHEN NOT MATCHED THEN
            INSERT ({insert_columns})
            VALUES ({insert_values})
        """
    else:
        merge_sql = f"""
        MERGE INTO {target_table} target
        USING source
        ON target.{Constants.METADATA_KEY_HASH} = source.{Constants.METADATA_KEY_HASH}
        WHEN MATCHED
        THEN
            UPDATE SET {update_set}
        WHEN NOT MATCHED THEN
            INSERT ({insert_columns})
            VALUES ({insert_values})
        """

    logger.info(f"Executing SCD Type 1 MERGE SQL:\n{merge_sql}")
    spark.sql(merge_sql)
111
+
112
+
113
def scd_type2(spark, target_table: str, source_df, join_keys: list,
              scd_columns: list, business_key: str) -> None:
    """
    Implements (part of) SCD Type 2 using Spark MERGE INTO: expires the
    current version of changed rows and inserts brand-new business keys.
    (The original docstring incorrectly said "SCD Type 1".)

    NOTE(review): a full SCD Type 2 flow also inserts a NEW current row for
    each changed business key; this single MERGE only closes the old row.
    Confirm whether a second insert pass runs elsewhere.

    Args:
        spark: SparkSession
        target_table: Target table name
        source_df: Source DataFrame
        join_keys: List of join key columns
        scd_columns: List of columns to track changes
        business_key: Business key column name
    """
    source_df.createOrReplaceTempView("source")

    join_condition = " AND ".join([f"target.{col} = source.{col}" for col in join_keys])

    # Any difference in a tracked column marks the current row as changed.
    change_condition = " OR ".join([f"target.{col} != source.{col}" for col in scd_columns])

    merge_sql = f"""
    MERGE INTO {target_table} target
    USING source
    ON {join_condition}
    WHEN MATCHED AND (
        {change_condition}
    ) THEN
        UPDATE SET
            is_current = false,
            is_deleted = false,
            end_date = current_date()
    WHEN NOT MATCHED THEN
        INSERT ({business_key}, {", ".join(scd_columns)}, is_current, is_deleted, start_date, end_date)
        VALUES (source.{business_key}, {", ".join([f"source.{col}" for col in scd_columns])}, true, false, current_date(), null)
    """

    # Log before executing, matching the sibling SCD helpers (the original
    # built an unused `scd_updates` string and logged nothing).
    logger.info(f"Executing SCD Type 2 MERGE SQL:\n{merge_sql}")
    spark.sql(merge_sql)
149
+
150
+
151
def load_file(spark: SparkSession, file_path: str, file_type: FileType) -> DataFrame:
    """
    Loads a file into a Spark DataFrame based on the specified file type.

    Args:
        spark: SparkSession
        file_path: Path to the file
        file_type: Type of the file (CSV, JSON, Parquet, or Delta)

    Returns:
        DataFrame containing the loaded data

    Raises:
        ValueError: If the file type is not supported.
    """
    if file_type == FileType.CSV:
        return spark.read.csv(file_path, header=True, inferSchema=True)
    elif file_type == FileType.JSON:
        return spark.read.json(file_path)
    elif file_type == FileType.PARQUET:
        return spark.read.parquet(file_path)
    elif file_type == FileType.DELTA:
        # FileType.DELTA exists in the enum but was previously rejected here.
        return spark.read.format("delta").load(file_path)
    else:
        raise ValueError(f"Unsupported file type: {file_type}")