sdmf 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff compares two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (35)
  1. sdmf/cli/main.py +4 -1
  2. sdmf/cli/main2.py +8 -1
  3. sdmf/data_movement_framework/BaseLoadStrategy.py +0 -4
  4. sdmf/data_movement_framework/LoadDispatcher.py +3 -1
  5. sdmf/data_movement_framework/load_types/AppendLoad.py +0 -1
  6. sdmf/data_movement_framework/load_types/FullLoad.py +0 -1
  7. sdmf/data_movement_framework/load_types/IncrementalCDC.py +0 -2
  8. sdmf/data_movement_framework/load_types/SCDType2.py +0 -1
  9. sdmf/data_movement_framework/load_types/StorageFetch.py +144 -0
  10. sdmf/data_quality/executors/ComprehensiveDQExecutor.py +2 -2
  11. sdmf/exception/BaseException.py +100 -0
  12. sdmf/exception/DataLoadException.py +8 -7
  13. sdmf/exception/{DataSpecValidationError.py → DataQualityException.py} +5 -4
  14. sdmf/exception/ExtractionException.py +3 -3
  15. sdmf/exception/ResultGenerationException.py +8 -7
  16. sdmf/exception/StorageFetchException.py +9 -0
  17. sdmf/exception/SystemError.py +2 -2
  18. sdmf/exception/ValidationError.py +3 -3
  19. sdmf-0.1.7.dist-info/METADATA +478 -0
  20. {sdmf-0.1.5.dist-info → sdmf-0.1.7.dist-info}/RECORD +22 -31
  21. sdmf/data_movement_framework/BaseExtractor.py +0 -23
  22. sdmf/exception/BasePipelineException.py +0 -59
  23. sdmf/exception/DataQualityError.py +0 -8
  24. sdmf/exception/DataSpecRuleExecutionError.py +0 -9
  25. sdmf/exception/EnvironmentPreparationError.py +0 -9
  26. sdmf/exception/FeedSpecValidationError.py +0 -9
  27. sdmf/extraction_toolkit/ExtractionController.py +0 -33
  28. sdmf/extraction_toolkit/__init__.py +0 -0
  29. sdmf/extraction_toolkit/data_class/ExtractionConfig.py +0 -9
  30. sdmf/extraction_toolkit/data_class/ExtractionResult.py +0 -19
  31. sdmf/extraction_toolkit/data_class/__init__.py +0 -0
  32. sdmf-0.1.5.dist-info/METADATA +0 -267
  33. sdmf-0.1.5.dist-info/licenses/LICENSE.txt +0 -201
  34. {sdmf-0.1.5.dist-info → sdmf-0.1.7.dist-info}/WHEEL +0 -0
  35. {sdmf-0.1.5.dist-info → sdmf-0.1.7.dist-info}/top_level.txt +0 -0
sdmf/cli/main.py CHANGED
@@ -10,7 +10,10 @@ spark = (
     .config("spark.scheduler.mode", "FAIR")
     .config(
         "spark.jars.packages",
-        "io.delta:delta-spark_2.12:3.1.0"
+        ",".join([
+            "io.delta:delta-spark_2.12:3.1.0",
+            "com.databricks:spark-xml_2.12:0.17.0"
+        ])
     )
     .config(
         "spark.sql.extensions",
sdmf/cli/main2.py CHANGED
@@ -8,9 +8,13 @@ spark = (
     SparkSession.builder
     .appName("sdmf")
     .enableHiveSupport()
+    .config("spark.scheduler.mode", "FAIR")
     .config(
         "spark.jars.packages",
-        "io.delta:delta-spark_2.12:3.1.0"
+        ",".join([
+            "io.delta:delta-spark_2.12:3.1.0",
+            "com.databricks:spark-xml_2.12:0.17.0"
+        ])
     )
     .config(
         "spark.sql.extensions",
@@ -43,3 +47,6 @@ spark = (
 # spark.sql('select count(*) from bronze.t_country_codes').show()


+
+spark.sql('select * from bronze.t_test2').show(truncate=False)
+
sdmf/data_movement_framework/BaseLoadStrategy.py CHANGED
@@ -80,7 +80,6 @@ class BaseLoadStrategy(ABC):
         except Exception as e:
             raise DataLoadException(
                 message="Somethine went wrong while executing data load",
-                load_type=self.config.master_specs["load_type"],
                 original_exception=e,
             )

@@ -385,7 +384,6 @@ class BaseLoadStrategy(ABC):
         except Exception as e:
             raise DataLoadException(
                 message=f"Error in staging layer for {self.config.feed_specs['source_table_name']}",
-                load_type=self.config.master_specs["load_type"],
                 original_exception=e,
             )

@@ -416,7 +414,6 @@ class BaseLoadStrategy(ABC):
                     f"Attempted: '{current_type}'. "
                     f"Switching load types is not permitted."
                 ),
-                load_type=self.config.master_specs["load_type"],
                 original_exception=None,
             )
         else:
@@ -438,6 +435,5 @@ class BaseLoadStrategy(ABC):
         except Exception as e:
             raise DataLoadException(
                 message="Something went wrong while enforcing load type consistency",
-                load_type=self.config.master_specs["load_type"],
                 original_exception=e,
             )
sdmf/data_movement_framework/LoadDispatcher.py CHANGED
@@ -12,6 +12,7 @@ from sdmf.data_movement_framework.load_types.AppendLoad import AppendLoad
 from sdmf.data_movement_framework.load_types.IncrementalCDC import IncrementalCDC
 from sdmf.data_movement_framework.load_types.SCDType2 import SCDType2
 from sdmf.data_movement_framework.load_types.APIExtractor import APIExtractor
+from sdmf.data_movement_framework.load_types.StorageFetch import StorageFetch
 from sdmf.data_movement_framework.data_class.LoadConfig import LoadConfig
 from sdmf.data_movement_framework.data_class.LoadResult import LoadResult

@@ -56,7 +57,8 @@ class LoadDispatcher():
             "SCD_TYPE_2": SCDType2,

             # extraction
-            "API_EXTRACTOR": APIExtractor
+            "API_EXTRACTOR": APIExtractor,
+            "STORAGE_FETCH":StorageFetch
         }

         load_class = load_type_map.get(self.master_spec.get('load_type', ""))
sdmf/data_movement_framework/load_types/AppendLoad.py CHANGED
@@ -71,7 +71,6 @@ class AppendLoad(BaseLoadStrategy):

         except Exception as e:
             raise DataLoadException(
-                load_type=self.config.master_specs["load_type"],
                 original_exception=e,
                 message=f"Error during APPEND_LOAD for {self._current_target_table_name}: {str(e)}"
             )
sdmf/data_movement_framework/load_types/FullLoad.py CHANGED
@@ -81,6 +81,5 @@ class FullLoad(BaseLoadStrategy):
         except Exception as e:
             raise DataLoadException(
                 message=f"Feed ID: {self.config.master_specs['feed_id']}, Error during FULL LOAD for {self._current_target_table_name}: {str(e)}",
-                load_type=self.config.master_specs["load_type"],
                 original_exception=e
             )
sdmf/data_movement_framework/load_types/IncrementalCDC.py CHANGED
@@ -134,7 +134,6 @@ class IncrementalCDC(BaseLoadStrategy):
         target_df = delta_target.toDF()
         if target_df.columns != incr_df.columns:
             raise DataLoadException(
-                load_type=self.config.feed_specs["load_type"],
                 original_exception=None,
                 message=f"Target table {target_table} schema [{target_df.columns}] does not match incremental data schema [{incr_df.columns}]."
             )
@@ -191,7 +190,6 @@ class IncrementalCDC(BaseLoadStrategy):
             )
         except Exception as e:
             raise DataLoadException(
-                load_type=self.config.feed_specs["load_type"],
                 original_exception=e,
                 message=f"Error during Incremental CDC load for {self._current_target_table_name}: {str(e)}"
             )
sdmf/data_movement_framework/load_types/SCDType2.py CHANGED
@@ -162,7 +162,6 @@ class SCDType2(BaseLoadStrategy):
             )
         except Exception as e:
             raise DataLoadException(
-                load_type=self.config.master_specs["load_type"],
                 original_exception=e,
                 message=f"Error during SCD_TYPE_2 for {self._current_target_table_name}: {str(e)}"
             )
sdmf/data_movement_framework/load_types/StorageFetch.py ADDED
@@ -0,0 +1,144 @@
+# inbuilt
+import os
+import uuid
+import time
+import random
+import logging
+import requests
+from io import BytesIO
+from requests.exceptions import RequestException
+
+# external
+from pyspark.sql import SparkSession, DataFrame
+from pyspark.sql.types import StructType
+from pyspark.sql.functions import input_file_name
+
+# internal
+from sdmf.data_movement_framework.BaseLoadStrategy import BaseLoadStrategy
+from sdmf.data_movement_framework.data_class.LoadConfig import LoadConfig
+from sdmf.data_movement_framework.data_class.LoadResult import LoadResult
+from sdmf.exception.StorageFetchException import StorageFetchException
+
+
+class StorageFetch(BaseLoadStrategy):
+
+    def __init__(self, config: LoadConfig, spark: SparkSession) -> None:
+        super().__init__(config=config, spark=spark)
+        self.logger = logging.getLogger(__name__)
+        self.config = config
+        self.spark = spark
+        self.file_type = self.config.feed_specs['storage_config']['file_type']
+        self.lookup_directory = self.config.feed_specs['storage_config']['lookup_directory']
+        if self.config.target_unity_catalog == "testing":
+            self.__bronze_schema = f"bronze"
+        else:
+            self.__bronze_schema = f"{self.config.target_unity_catalog}.bronze"
+        self.logger.warning('Storage Fetch will always dump data in bronze schema as per medallion architecture.')
+
+    def load(self) -> LoadResult:
+        try:
+
+            results_df = self.__load_file_to_dataframe()
+            self.spark.sql(f"CREATE SCHEMA IF NOT EXISTS {self.__bronze_schema}")
+            feed_temp = (
+                f"{self.__bronze_schema}."
+                f"{self.config.master_specs['target_table_name']}"
+            )
+            self.logger.info(f"Creating bronze table: {feed_temp}")
+
+
+            (
+                results_df.write.
+                format("delta")
+                .mode("overwrite")
+                .saveAsTable(feed_temp)
+            )
+            return LoadResult(
+                feed_id = self.config.master_specs['feed_id'],
+                success=True,
+                total_rows_inserted=results_df.count(),
+                total_rows_updated=0,
+                total_rows_deleted=0
+            )
+        except Exception as e:
+            raise StorageFetchException(
+                message=f"Feed ID: {self.config.master_specs['feed_id']}, Error during FULL LOAD for {self._current_target_table_name}: {str(e)}",
+                original_exception=e
+            )
+
+    def __iterate_over_latest_medallion_directory(self, base_path) -> str:
+        """
+        Returns the maximum integer directory under base_path.
+        Ignores files.
+        """
+        max_dir = float('-inf')
+        for item in os.listdir(base_path):
+            if max_dir < int(item):
+                max_dir = int(item)
+        return str(max_dir)
+
+    def __load_file_to_dataframe(self) -> DataFrame:
+        file_path = self.__build_file_destination_directory(self.lookup_directory)
+        self.logger.info(f"Fetching data from path: {file_path}")
+
+        if self.file_type == 'xml':
+            df = (
+                self.spark.read
+                .format("xml")
+                .option(
+                    "rowTag",
+                    self.config.feed_specs['storage_config']['xml_row_tag']
+                )
+                .load(file_path)
+            )
+
+        elif self.file_type == 'json':
+            df = (
+                self.spark.read
+                .format("json")
+                .load(file_path)
+            )
+
+        elif self.file_type == 'parquet':
+            df = (
+                self.spark.read
+                .format("json")
+                .load(file_path)
+            )
+
+        else:
+            raise StorageFetchException(
+                "Invalid/missing value for [file_type] parameter in feed specs"
+            )
+
+        schema = StructType.fromJson(self.config.feed_specs['selection_schema'])
+        df = self._enforce_schema(df, schema)
+        df = df.withColumn("_x_source_file", input_file_name())
+
+        return df
+
+    def __build_file_destination_directory(self, base_path_prefix: str) -> str:
+        storage_type = self.config.feed_specs['storage_config']['storage_type']
+        is_multi_file = self.config.feed_specs['storage_config']['is_multi_file']
+        inside_timestamp_dir = self.config.feed_specs['storage_config']['inside_timestamp_dir']
+        file_name = self.config.feed_specs['storage_config']['file_name']
+
+        if storage_type == 'MEDALLION':
+            current_year = self.__iterate_over_latest_medallion_directory(base_path_prefix)
+            current_month = self.__iterate_over_latest_medallion_directory(os.path.join(base_path_prefix, current_year))
+            current_day = self.__iterate_over_latest_medallion_directory(os.path.join(base_path_prefix, current_year, current_month))
+            latest_timestamp = self.__iterate_over_latest_medallion_directory(os.path.join(base_path_prefix, current_year, current_month, current_day))
+            if is_multi_file == True:
+                return f"{base_path_prefix}/{current_year}/{current_month}/{current_day}/{latest_timestamp}/{inside_timestamp_dir}/*.{self.file_type}"
+            else:
+                return f"{base_path_prefix}/{current_year}/{current_month}/{current_day}/{latest_timestamp}/{inside_timestamp_dir}/{file_name}"
+
+        elif storage_type == 'STANDARD':
+            if is_multi_file == True:
+                return f"{base_path_prefix}/*.{self.file_type}"
+            else:
+                return f"{base_path_prefix}/{file_name}"
+        else:
+            raise StorageFetchException(
+                "Invalid/missing value for [storage_type] parameter in feed specs"
+            )
sdmf/data_quality/executors/ComprehensiveDQExecutor.py CHANGED
@@ -2,7 +2,7 @@
 import logging

 # internal
-from sdmf.exception.DataQualityError import DataQualityError
+from sdmf.exception.DataQualityException import DataQualityException


 class ComprehensiveDQExecutor:
@@ -24,7 +24,7 @@ class ComprehensiveDQExecutor:
         dependency_ds = check.get("dependency_dataset", [])
         for dds in dependency_ds:
             if self.spark.catalog.tableExists(dds) == False:
-                raise DataQualityError
+                raise DataQualityException
         query = check.get("query")
         severity = check.get("severity", "").upper()
         threshold = check.get("threshold", 0)
sdmf/exception/BaseException.py ADDED
@@ -0,0 +1,100 @@
+# inbuilt
+import sys
+import traceback
+import logging
+
+
+class BaseException(Exception):
+    """
+    Unified base exception for all SDMF pipeline errors.
+    Automatically logs a clean, human-readable error block.
+    """
+
+    def __init__(
+        self,
+        message=None,
+        details=None,
+        context=None,
+        original_exception=None,
+        log=True,
+    ):
+        super().__init__(message)
+
+        self.message = message or self.__class__.__name__
+        self.details = details
+        self.context = context or {}
+        self.original_exception = original_exception
+
+        # Capture traceback safely
+        exc_type, exc_value, exc_tb = sys.exc_info()
+        self.exc_type = exc_type.__name__ if exc_type else None
+        self.exc_value = str(exc_value) if exc_value else None
+        self.full_traceback = (
+            "".join(traceback.format_exception(exc_type, exc_value, exc_tb))
+            if exc_type
+            else None
+        )
+
+        self.logger = logging.getLogger(__name__)
+
+        # Log once, cleanly
+        if log:
+            self.logger.error(self.to_pretty_text())
+
+    # --------------------------------------------------
+    # Human-readable output (for logs / console)
+    # --------------------------------------------------
+    def __str__(self):
+        return self.to_pretty_text()
+
+    def to_pretty_text(self):
+        return f"""
+==================== SDMF ERROR ====================
+
+Error Type:
+{self.__class__.__name__}
+
+Message:
+{self.message}
+
+-------------------- DETAILS --------------------
+{self._format_block(self.details)}
+
+-------------------- CONTEXT --------------------
+{self._format_block(self.context)}
+
+------------- ORIGINAL EXCEPTION ---------------
+{self._format_block(repr(self.original_exception) if self.original_exception else None)}
+
+------------------ STACK TRACE ------------------
+{self._format_block(self.full_traceback)}
+
+=================================================
+""".strip()
+
+    # --------------------------------------------------
+    # Structured output (for MLflow / REST / JSON)
+    # --------------------------------------------------
+    def to_dict(self):
+        """Structured error payload for APIs, MLflow, or persistence."""
+        return {
+            "error_type": self.__class__.__name__,
+            "message": self.message,
+            "details": self.details,
+            "context": self.context,
+            "original_exception": repr(self.original_exception)
+            if self.original_exception
+            else None,
+            "exception_type": self.exc_type,
+            "exception_message": self.exc_value,
+            "traceback": self.full_traceback,
+        }
+
+    # --------------------------------------------------
+    # Helpers
+    # --------------------------------------------------
+    @staticmethod
+    def _format_block(value):
+        if value in (None, "", {}, []):
+            return "N/A"
+        return value
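
A brief usage sketch of the new unified hierarchy, with hypothetical values; DataLoadException's refactored signature appears in the next file below. Note that the class shadows Python's built-in BaseException name, so import sites must use the sdmf.exception module path.

# Hypothetical usage; raising inside an except block lets sys.exc_info()
# capture the original traceback, and the error block is logged once.
from sdmf.exception.DataLoadException import DataLoadException

try:
    raise ValueError("bad row count")
except Exception as e:
    try:
        raise DataLoadException(
            message="Error during FULL LOAD for bronze.t_example",
            details="row count mismatch",
            original_exception=e,
        )
    except DataLoadException as dle:
        payload = dle.to_dict()  # structured form for APIs / persistence
        assert payload["error_type"] == "DataLoadException"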
sdmf/exception/DataLoadException.py CHANGED
@@ -1,8 +1,9 @@
-from sdmf.exception.BasePipelineException import BasePipelineException
+from sdmf.exception.BaseException import BaseException

-class DataLoadException(BasePipelineException):
-    def __init__(self, message: str, load_type: str, original_exception):
-        super().__init__(message)
-        self.message = message
-        self.load_type = load_type
-        self.original_exception = original_exception
+class DataLoadException(BaseException):
+    def __init__(self, message=None, details=None, original_exception=None):
+        super().__init__(
+            message or "Data Load Exception",
+            details=details,
+            original_exception=original_exception
+        )
sdmf/exception/{DataSpecValidationError.py → DataQualityException.py} RENAMED
@@ -1,9 +1,10 @@
-from sdmf.exception.BasePipelineException import BasePipelineException
+from sdmf.exception.BaseException import BaseException

-class DataSpecValidationError(BasePipelineException):
+class DataQualityException(BaseException):
     def __init__(self, message=None, details=None, original_exception=None):
         super().__init__(
-            message or "Failed to Validate.",
+            message or "Data Quality Exception",
             details=details,
             original_exception=original_exception
-        )
+        )
+
sdmf/exception/ExtractionException.py CHANGED
@@ -1,9 +1,9 @@
-from sdmf.exception.BasePipelineException import BasePipelineException
+from sdmf.exception.BaseException import BaseException

-class ExtractionException(BasePipelineException):
+class ExtractionException(BaseException):
     def __init__(self, message=None, details=None, original_exception=None):
         super().__init__(
-            message or "Extraction Error",
+            message or "Extraction Exception",
             details=details,
             original_exception=original_exception
         )
sdmf/exception/ResultGenerationException.py CHANGED
@@ -1,8 +1,9 @@
-from sdmf.exception.BasePipelineException import BasePipelineException
+from sdmf.exception.BaseException import BaseException

-class ResultGenerationException(BasePipelineException):
-    def __init__(self, message: str,original_exception, details=None):
-        super().__init__(message)
-        self.message = message
-        self.details = details
-        self.original_exception = original_exception
+class ResultGenerationException(BaseException):
+    def __init__(self, message=None, details=None, original_exception=None):
+        super().__init__(
+            message or "Result Generation Exception",
+            details=details,
+            original_exception=original_exception
+        )
sdmf/exception/StorageFetchException.py ADDED
@@ -0,0 +1,9 @@
+from sdmf.exception.BaseException import BaseException
+
+class StorageFetchException(BaseException):
+    def __init__(self, message=None, details=None, original_exception=None):
+        super().__init__(
+            message or "Storage Fetch Exception",
+            details=details,
+            original_exception=original_exception
+        )
sdmf/exception/SystemError.py CHANGED
@@ -1,6 +1,6 @@
-from sdmf.exception.BasePipelineException import BasePipelineException
+from sdmf.exception.BaseException import BaseException

-class SystemError(BasePipelineException):
+class SystemError(BaseException):
     def __init__(self, message=None, details=None, original_exception=None):
         super().__init__(
             message or "System Error",
sdmf/exception/ValidationError.py CHANGED
@@ -1,9 +1,9 @@
-from sdmf.exception.BasePipelineException import BasePipelineException
+from sdmf.exception.BaseException import BaseException

-class ValidationError(BasePipelineException):
+class ValidationError(BaseException):
     def __init__(self, message=None, details=None, original_exception=None):
         super().__init__(
-            message or "Extraction Error",
+            message or "Validation Error",
             details=details,
             original_exception=original_exception
         )