sdmf 0.1.6__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. sdmf/cli/main.py +4 -1
  2. sdmf/cli/main2.py +8 -1
  3. sdmf/data_movement_framework/BaseLoadStrategy.py +0 -4
  4. sdmf/data_movement_framework/LoadDispatcher.py +3 -1
  5. sdmf/data_movement_framework/load_types/AppendLoad.py +0 -1
  6. sdmf/data_movement_framework/load_types/FullLoad.py +0 -1
  7. sdmf/data_movement_framework/load_types/IncrementalCDC.py +0 -2
  8. sdmf/data_movement_framework/load_types/SCDType2.py +0 -1
  9. sdmf/data_movement_framework/load_types/StorageFetch.py +144 -0
  10. sdmf/data_quality/executors/ComprehensiveDQExecutor.py +2 -2
  11. sdmf/exception/BaseException.py +100 -0
  12. sdmf/exception/DataLoadException.py +8 -7
  13. sdmf/exception/{DataSpecValidationError.py → DataQualityException.py} +5 -4
  14. sdmf/exception/ExtractionException.py +3 -3
  15. sdmf/exception/ResultGenerationException.py +8 -7
  16. sdmf/exception/StorageFetchException.py +9 -0
  17. sdmf/exception/SystemError.py +2 -2
  18. sdmf/exception/ValidationError.py +3 -3
  19. {sdmf-0.1.6.dist-info → sdmf-0.1.8.dist-info}/METADATA +9 -6
  20. {sdmf-0.1.6.dist-info → sdmf-0.1.8.dist-info}/RECORD +22 -30
  21. sdmf/data_movement_framework/BaseExtractor.py +0 -23
  22. sdmf/exception/BasePipelineException.py +0 -59
  23. sdmf/exception/DataQualityError.py +0 -8
  24. sdmf/exception/DataSpecRuleExecutionError.py +0 -9
  25. sdmf/exception/EnvironmentPreparationError.py +0 -9
  26. sdmf/exception/FeedSpecValidationError.py +0 -9
  27. sdmf/extraction_toolkit/ExtractionController.py +0 -33
  28. sdmf/extraction_toolkit/__init__.py +0 -0
  29. sdmf/extraction_toolkit/data_class/ExtractionConfig.py +0 -9
  30. sdmf/extraction_toolkit/data_class/ExtractionResult.py +0 -19
  31. sdmf/extraction_toolkit/data_class/__init__.py +0 -0
  32. {sdmf-0.1.6.dist-info → sdmf-0.1.8.dist-info}/WHEEL +0 -0
  33. {sdmf-0.1.6.dist-info → sdmf-0.1.8.dist-info}/top_level.txt +0 -0
sdmf/cli/main.py CHANGED
@@ -10,7 +10,10 @@ spark = (
     .config("spark.scheduler.mode", "FAIR")
     .config(
         "spark.jars.packages",
-        "io.delta:delta-spark_2.12:3.1.0"
+        ",".join([
+            "io.delta:delta-spark_2.12:3.1.0",
+            "com.databricks:spark-xml_2.12:0.17.0"
+        ])
     )
     .config(
         "spark.sql.extensions",
sdmf/cli/main2.py CHANGED
@@ -8,9 +8,13 @@ spark = (
     SparkSession.builder
     .appName("sdmf")
     .enableHiveSupport()
+    .config("spark.scheduler.mode", "FAIR")
     .config(
         "spark.jars.packages",
-        "io.delta:delta-spark_2.12:3.1.0"
+        ",".join([
+            "io.delta:delta-spark_2.12:3.1.0",
+            "com.databricks:spark-xml_2.12:0.17.0"
+        ])
     )
     .config(
         "spark.sql.extensions",
@@ -43,3 +47,6 @@ spark = (
 # spark.sql('select count(*) from bronze.t_country_codes').show()
 
 
+
+spark.sql('select * from bronze.t_test2').show(truncate=False)
+
sdmf/data_movement_framework/BaseLoadStrategy.py CHANGED
@@ -80,7 +80,6 @@ class BaseLoadStrategy(ABC):
         except Exception as e:
             raise DataLoadException(
                 message="Somethine went wrong while executing data load",
-                load_type=self.config.master_specs["load_type"],
                 original_exception=e,
             )
 
@@ -385,7 +384,6 @@ class BaseLoadStrategy(ABC):
         except Exception as e:
             raise DataLoadException(
                 message=f"Error in staging layer for {self.config.feed_specs['source_table_name']}",
-                load_type=self.config.master_specs["load_type"],
                 original_exception=e,
             )
 
@@ -416,7 +414,6 @@ class BaseLoadStrategy(ABC):
                     f"Attempted: '{current_type}'. "
                     f"Switching load types is not permitted."
                 ),
-                load_type=self.config.master_specs["load_type"],
                 original_exception=None,
             )
         else:
@@ -438,6 +435,5 @@ class BaseLoadStrategy(ABC):
         except Exception as e:
             raise DataLoadException(
                 message="Something went wrong while enforcing load type consistency",
-                load_type=self.config.master_specs["load_type"],
                 original_exception=e,
             )
sdmf/data_movement_framework/LoadDispatcher.py CHANGED
@@ -12,6 +12,7 @@ from sdmf.data_movement_framework.load_types.AppendLoad import AppendLoad
 from sdmf.data_movement_framework.load_types.IncrementalCDC import IncrementalCDC
 from sdmf.data_movement_framework.load_types.SCDType2 import SCDType2
 from sdmf.data_movement_framework.load_types.APIExtractor import APIExtractor
+from sdmf.data_movement_framework.load_types.StorageFetch import StorageFetch
 from sdmf.data_movement_framework.data_class.LoadConfig import LoadConfig
 from sdmf.data_movement_framework.data_class.LoadResult import LoadResult
 
@@ -56,7 +57,8 @@ class LoadDispatcher():
             "SCD_TYPE_2": SCDType2,
 
             # extraction
-            "API_EXTRACTOR": APIExtractor
+            "API_EXTRACTOR": APIExtractor,
+            "STORAGE_FETCH":StorageFetch
         }
 
         load_class = load_type_map.get(self.master_spec.get('load_type', ""))
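
With the new map entry, dispatch is driven entirely by the 'load_type' key of the master spec. A minimal sketch of the lookup (the master_spec dict here is a hypothetical example):

    # Hypothetical master spec; only 'load_type' matters for dispatch.
    master_spec = {"load_type": "STORAGE_FETCH"}
    load_class = load_type_map.get(master_spec.get("load_type", ""))
    # load_class is now StorageFetch; an unknown load_type yields None.
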
sdmf/data_movement_framework/load_types/AppendLoad.py CHANGED
@@ -71,7 +71,6 @@ class AppendLoad(BaseLoadStrategy):
 
         except Exception as e:
             raise DataLoadException(
-                load_type=self.config.master_specs["load_type"],
                 original_exception=e,
                 message=f"Error during APPEND_LOAD for {self._current_target_table_name}: {str(e)}"
             )
sdmf/data_movement_framework/load_types/FullLoad.py CHANGED
@@ -81,6 +81,5 @@ class FullLoad(BaseLoadStrategy):
         except Exception as e:
             raise DataLoadException(
                 message=f"Feed ID: {self.config.master_specs['feed_id']}, Error during FULL LOAD for {self._current_target_table_name}: {str(e)}",
-                load_type=self.config.master_specs["load_type"],
                 original_exception=e
             )
sdmf/data_movement_framework/load_types/IncrementalCDC.py CHANGED
@@ -134,7 +134,6 @@ class IncrementalCDC(BaseLoadStrategy):
         target_df = delta_target.toDF()
         if target_df.columns != incr_df.columns:
             raise DataLoadException(
-                load_type=self.config.feed_specs["load_type"],
                 original_exception=None,
                 message=f"Target table {target_table} schema [{target_df.columns}] does not match incremental data schema [{incr_df.columns}]."
             )
@@ -191,7 +190,6 @@ class IncrementalCDC(BaseLoadStrategy):
             )
         except Exception as e:
             raise DataLoadException(
-                load_type=self.config.feed_specs["load_type"],
                 original_exception=e,
                 message=f"Error during Incremental CDC load for {self._current_target_table_name}: {str(e)}"
             )
sdmf/data_movement_framework/load_types/SCDType2.py CHANGED
@@ -162,7 +162,6 @@ class SCDType2(BaseLoadStrategy):
             )
         except Exception as e:
             raise DataLoadException(
-                load_type=self.config.master_specs["load_type"],
                 original_exception=e,
                 message=f"Error during SCD_TYPE_2 for {self._current_target_table_name}: {str(e)}"
             )
sdmf/data_movement_framework/load_types/StorageFetch.py ADDED
@@ -0,0 +1,144 @@
+# inbuilt
+import os
+import uuid
+import time
+import random
+import logging
+import requests
+from io import BytesIO
+from requests.exceptions import RequestException
+
+# external
+from pyspark.sql import SparkSession, DataFrame
+from pyspark.sql.types import StructType
+from pyspark.sql.functions import input_file_name
+
+# internal
+from sdmf.data_movement_framework.BaseLoadStrategy import BaseLoadStrategy
+from sdmf.data_movement_framework.data_class.LoadConfig import LoadConfig
+from sdmf.data_movement_framework.data_class.LoadResult import LoadResult
+from sdmf.exception.StorageFetchException import StorageFetchException
+
+
+class StorageFetch(BaseLoadStrategy):
+
+    def __init__(self, config: LoadConfig, spark: SparkSession) -> None:
+        super().__init__(config=config, spark=spark)
+        self.logger = logging.getLogger(__name__)
+        self.config = config
+        self.spark = spark
+        self.file_type = self.config.feed_specs['storage_config']['file_type']
+        self.lookup_directory = self.config.feed_specs['storage_config']['lookup_directory']
+        if self.config.target_unity_catalog == "testing":
+            self.__bronze_schema = f"bronze"
+        else:
+            self.__bronze_schema = f"{self.config.target_unity_catalog}.bronze"
+        self.logger.warning('Storage Fetch will always dump data in bronze schema as per medallion architecture.')
+
+    def load(self) -> LoadResult:
+        try:
+
+            results_df = self.__load_file_to_dataframe()
+            self.spark.sql(f"CREATE SCHEMA IF NOT EXISTS {self.__bronze_schema}")
+            feed_temp = (
+                f"{self.__bronze_schema}."
+                f"{self.config.master_specs['target_table_name']}"
+            )
+            self.logger.info(f"Creating bronze table: {feed_temp}")
+
+
+            (
+                results_df.write.
+                format("delta")
+                .mode("overwrite")
+                .saveAsTable(feed_temp)
+            )
+            return LoadResult(
+                feed_id = self.config.master_specs['feed_id'],
+                success=True,
+                total_rows_inserted=results_df.count(),
+                total_rows_updated=0,
+                total_rows_deleted=0
+            )
+        except Exception as e:
+            raise StorageFetchException(
+                message=f"Feed ID: {self.config.master_specs['feed_id']}, Error during FULL LOAD for {self._current_target_table_name}: {str(e)}",
+                original_exception=e
+            )
+
+    def __iterate_over_latest_medallion_directory(self, base_path) -> str:
+        """
+        Returns the maximum integer directory under base_path.
+        Ignores files.
+        """
+        max_dir = float('-inf')
+        for item in os.listdir(base_path):
+            if max_dir < int(item):
+                max_dir = int(item)
+        return str(max_dir)
+
+    def __load_file_to_dataframe(self) -> DataFrame:
+        file_path = self.__build_file_destination_directory(self.lookup_directory)
+        self.logger.info(f"Fetching data from path: {file_path}")
+
+        if self.file_type == 'xml':
+            df = (
+                self.spark.read
+                .format("xml")
+                .option(
+                    "rowTag",
+                    self.config.feed_specs['storage_config']['xml_row_tag']
+                )
+                .load(file_path)
+            )
+
+        elif self.file_type == 'json':
+            df = (
+                self.spark.read
+                .format("json")
+                .load(file_path)
+            )
+
+        elif self.file_type == 'parquet':
+            df = (
+                self.spark.read
+                .format("json")
+                .load(file_path)
+            )
+
+        else:
+            raise StorageFetchException(
+                "Invalid/missing value for [file_type] parameter in feed specs"
+            )
+
+        schema = StructType.fromJson(self.config.feed_specs['selection_schema'])
+        df = self._enforce_schema(df, schema)
+        df = df.withColumn("_x_source_file", input_file_name())
+
+        return df
+
+    def __build_file_destination_directory(self, base_path_prefix: str) -> str:
+        storage_type = self.config.feed_specs['storage_config']['storage_type']
+        is_multi_file = self.config.feed_specs['storage_config']['is_multi_file']
+        inside_timestamp_dir = self.config.feed_specs['storage_config']['inside_timestamp_dir']
+        file_name = self.config.feed_specs['storage_config']['file_name']
+
+        if storage_type == 'MEDALLION':
+            current_year = self.__iterate_over_latest_medallion_directory(base_path_prefix)
+            current_month = self.__iterate_over_latest_medallion_directory(os.path.join(base_path_prefix, current_year))
+            current_day = self.__iterate_over_latest_medallion_directory(os.path.join(base_path_prefix, current_year, current_month))
+            latest_timestamp = self.__iterate_over_latest_medallion_directory(os.path.join(base_path_prefix, current_year, current_month, current_day))
+            if is_multi_file == True:
+                return f"{base_path_prefix}/{current_year}/{current_month}/{current_day}/{latest_timestamp}/{inside_timestamp_dir}/*.{self.file_type}"
+            else:
+                return f"{base_path_prefix}/{current_year}/{current_month}/{current_day}/{latest_timestamp}/{inside_timestamp_dir}/{file_name}"
+
+        elif storage_type == 'STANDARD':
+            if is_multi_file == True:
+                return f"{base_path_prefix}/*.{self.file_type}"
+            else:
+                return f"{base_path_prefix}/{file_name}"
+        else:
+            raise StorageFetchException(
+                "Invalid/missing value for [storage_type] parameter in feed specs"
+            )
sdmf/data_quality/executors/ComprehensiveDQExecutor.py CHANGED
@@ -2,7 +2,7 @@
 import logging
 
 # internal
-from sdmf.exception.DataQualityError import DataQualityError
+from sdmf.exception.DataQualityException import DataQualityException
 
 
 class ComprehensiveDQExecutor:
@@ -24,7 +24,7 @@ class ComprehensiveDQExecutor:
             dependency_ds = check.get("dependency_dataset", [])
             for dds in dependency_ds:
                 if self.spark.catalog.tableExists(dds) == False:
-                    raise DataQualityError
+                    raise DataQualityException
             query = check.get("query")
             severity = check.get("severity", "").upper()
             threshold = check.get("threshold", 0)
sdmf/exception/BaseException.py ADDED
@@ -0,0 +1,100 @@
+# inbuilt
+import sys
+import traceback
+import logging
+
+
+class BaseException(Exception):
+    """
+    Unified base exception for all SDMF pipeline errors.
+    Automatically logs a clean, human-readable error block.
+    """
+
+    def __init__(
+        self,
+        message=None,
+        details=None,
+        context=None,
+        original_exception=None,
+        log=True,
+    ):
+        super().__init__(message)
+
+        self.message = message or self.__class__.__name__
+        self.details = details
+        self.context = context or {}
+        self.original_exception = original_exception
+
+        # Capture traceback safely
+        exc_type, exc_value, exc_tb = sys.exc_info()
+        self.exc_type = exc_type.__name__ if exc_type else None
+        self.exc_value = str(exc_value) if exc_value else None
+        self.full_traceback = (
+            "".join(traceback.format_exception(exc_type, exc_value, exc_tb))
+            if exc_type
+            else None
+        )
+
+        self.logger = logging.getLogger(__name__)
+
+        # Log once, cleanly
+        if log:
+            self.logger.error(self.to_pretty_text())
+
+    # --------------------------------------------------
+    # Human-readable output (for logs / console)
+    # --------------------------------------------------
+    def __str__(self):
+        return self.to_pretty_text()
+
+    def to_pretty_text(self):
+        return f"""
+        ==================== SDMF ERROR ====================
+
+        Error Type:
+        {self.__class__.__name__}
+
+        Message:
+        {self.message}
+
+        -------------------- DETAILS --------------------
+        {self._format_block(self.details)}
+
+        -------------------- CONTEXT --------------------
+        {self._format_block(self.context)}
+
+        ------------- ORIGINAL EXCEPTION ---------------
+        {self._format_block(repr(self.original_exception) if self.original_exception else None)}
+
+        ------------------ STACK TRACE ------------------
+        {self._format_block(self.full_traceback)}
+
+        =================================================
+        """.strip()
+
+    # --------------------------------------------------
+    # Structured output (for MLflow / REST / JSON)
+    # --------------------------------------------------
+    def to_dict(self):
+        """Structured error payload for APIs, MLflow, or persistence."""
+        return {
+            "error_type": self.__class__.__name__,
+            "message": self.message,
+            "details": self.details,
+            "context": self.context,
+            "original_exception": repr(self.original_exception)
+            if self.original_exception
+            else None,
+            "exception_type": self.exc_type,
+            "exception_message": self.exc_value,
+            "traceback": self.full_traceback,
+        }
+
+    # --------------------------------------------------
+    # Helpers
+    # --------------------------------------------------
+    @staticmethod
+    def _format_block(value):
+        if value in (None, "", {}, []):
+            return "N/A"
+        return value
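
Every exception class that follows delegates to this base, so raising any of them logs the pretty block once and exposes the same structured payload. A minimal usage sketch (with DataLoadException, reworked below):

    from sdmf.exception.DataLoadException import DataLoadException

    try:
        1 / 0
    except Exception as e:
        # Constructing the exception captures sys.exc_info() and logs once.
        err = DataLoadException(
            message="demo failure",
            details="division step of a toy pipeline",
            original_exception=e,
        )
        payload = err.to_dict()  # error_type, message, context, traceback, ...
        raise err
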
sdmf/exception/DataLoadException.py CHANGED
@@ -1,8 +1,9 @@
-from sdmf.exception.BasePipelineException import BasePipelineException
+from sdmf.exception.BaseException import BaseException
 
-class DataLoadException(BasePipelineException):
-    def __init__(self, message: str, load_type: str, original_exception):
-        super().__init__(message)
-        self.message = message
-        self.load_type = load_type
-        self.original_exception = original_exception
+class DataLoadException(BaseException):
+    def __init__(self, message=None, details=None, original_exception=None):
+        super().__init__(
+            message or "Data Load Exception",
+            details=details,
+            original_exception=original_exception
+        )
sdmf/exception/DataSpecValidationError.py → sdmf/exception/DataQualityException.py RENAMED
@@ -1,9 +1,10 @@
-from sdmf.exception.BasePipelineException import BasePipelineException
+from sdmf.exception.BaseException import BaseException
 
-class DataSpecValidationError(BasePipelineException):
+class DataQualityException(BaseException):
     def __init__(self, message=None, details=None, original_exception=None):
         super().__init__(
-            message or "Failed to Validate.",
+            message or "Data Quality Exception",
             details=details,
             original_exception=original_exception
-        )
+        )
+
sdmf/exception/ExtractionException.py CHANGED
@@ -1,9 +1,9 @@
-from sdmf.exception.BasePipelineException import BasePipelineException
+from sdmf.exception.BaseException import BaseException
 
-class ExtractionException(BasePipelineException):
+class ExtractionException(BaseException):
     def __init__(self, message=None, details=None, original_exception=None):
         super().__init__(
-            message or "Extraction Error",
+            message or "Extraction Exception",
             details=details,
             original_exception=original_exception
         )
sdmf/exception/ResultGenerationException.py CHANGED
@@ -1,8 +1,9 @@
-from sdmf.exception.BasePipelineException import BasePipelineException
+from sdmf.exception.BaseException import BaseException
 
-class ResultGenerationException(BasePipelineException):
-    def __init__(self, message: str,original_exception, details=None):
-        super().__init__(message)
-        self.message = message
-        self.details = details
-        self.original_exception = original_exception
+class ResultGenerationException(BaseException):
+    def __init__(self, message=None, details=None, original_exception=None):
+        super().__init__(
+            message or "Result Generation Exception",
+            details=details,
+            original_exception=original_exception
+        )
sdmf/exception/StorageFetchException.py ADDED
@@ -0,0 +1,9 @@
+from sdmf.exception.BaseException import BaseException
+
+class StorageFetchException(BaseException):
+    def __init__(self, message=None, details=None, original_exception=None):
+        super().__init__(
+            message or "Storage Fetch Exception",
+            details=details,
+            original_exception=original_exception
+        )
sdmf/exception/SystemError.py CHANGED
@@ -1,6 +1,6 @@
-from sdmf.exception.BasePipelineException import BasePipelineException
+from sdmf.exception.BaseException import BaseException
 
-class SystemError(BasePipelineException):
+class SystemError(BaseException):
     def __init__(self, message=None, details=None, original_exception=None):
         super().__init__(
             message or "System Error",
sdmf/exception/ValidationError.py CHANGED
@@ -1,9 +1,9 @@
-from sdmf.exception.BasePipelineException import BasePipelineException
+from sdmf.exception.BaseException import BaseException
 
-class ValidationError(BasePipelineException):
+class ValidationError(BaseException):
     def __init__(self, message=None, details=None, original_exception=None):
         super().__init__(
-            message or "Extraction Error",
+            message or "Validation Error",
             details=details,
             original_exception=original_exception
         )
sdmf-0.1.6.dist-info/METADATA → sdmf-0.1.8.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sdmf
-Version: 0.1.6
+Version: 0.1.8
 Summary: SDMF - Standard Data Management Framework
 Author: Harsh Hando
 Author-email: Harsh Handoo <handoo.harsh@gmail.com>
@@ -206,12 +206,13 @@ License: Apache License
    See the License for the specific language governing permissions and
    limitations under the License.
 
-Project-URL: License, https://github.com/yalsworldofficial/standard-data-management-framework/blob/main/LICENSE
+Project-URL: Homepage, https://github.com/hhandoo/sdmf-official
+Project-URL: Repository, https://github.com/hhandoo/sdmf-official
+Project-URL: Documentation, https://github.com/hhandoo/sdmf-official#readme
+Project-URL: Changelog, https://github.com/hhandoo/sdmf-official/blob/main/CHANGELOG.md
+Project-URL: Issues, https://github.com/hhandoo/sdmf-official/issues
+Project-URL: License, https://github.com/hhandoo/sdmf-official/blob/main/LICENSE
 Classifier: License :: OSI Approved :: Apache Software License
-Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.9
-Classifier: Programming Language :: Python :: 3.10
-Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Operating System :: OS Independent
 Classifier: Development Status :: 3 - Alpha
@@ -220,6 +221,8 @@ Description-Content-Type: text/markdown
 Requires-Dist: pandas
 Requires-Dist: openpyxl
 Requires-Dist: matplotlib
+Requires-Dist: delta-spark
+Requires-Dist: numpy
 
 # Standard Data Management Framework (SDMF)
 
sdmf-0.1.6.dist-info/RECORD → sdmf-0.1.8.dist-info/RECORD RENAMED
@@ -1,28 +1,28 @@
 sdmf/__init__.py,sha256=dsAMhbmYMsgekduahb91sqYr9I2hag3Ezhlzsjo3k5g,117
 sdmf/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-sdmf/cli/main.py,sha256=5UTZYQRMTwwZDiwyiu-oYs8bTletcn3KP9560wHOTMk,3400
-sdmf/cli/main2.py,sha256=jqt6cB-r5tNkBnLxKm0UG70plyS7-BWDVDb7VohynQ0,1409
+sdmf/cli/main.py,sha256=FiUoC8Zbr2oemaQKlHglJZjVNLrgDUGqCTa2TBmN_Go,3486
+sdmf/cli/main2.py,sha256=2RQ1YZvaTPMwEWz7ysF73uP-lCzF6wITjVds5gXa3w8,1604
 sdmf/cli/tt.py,sha256=ocxNSD93-bGWUk25cBJC8Zl3-mHCnvp45R3QLnwtNkI,128
 sdmf/config/LoggingConfig.py,sha256=-HeVRB12DNh5Lv8RTCxAY_jCDH-EKbcS2xV0tCgksLg,2984
 sdmf/config/LoggingPrettyFormatter.py,sha256=3UJBwmI_szxct1auy_YX9cM6qHD9EW1D_aSxlpJn6K8,1045
 sdmf/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sdmf/data_flow_diagram_generator/DataFlowDiagramGenerator.py,sha256=nerO6bH__g80VeqNQArFwEpdwJC9X1isLU3Q6hsAs6A,5852
-sdmf/data_movement_framework/BaseExtractor.py,sha256=1qmty27E5WPeHWJp3_yVJSvFAu7mrniKA4YZWmanIhU,667
-sdmf/data_movement_framework/BaseLoadStrategy.py,sha256=FQAMu60iUxoQRY9MM-sNK9jbEPjjBrbIIEn5xm9NFvc,19421
+sdmf/data_movement_framework/BaseLoadStrategy.py,sha256=0JTwnckFKtYwhWxIzQJ4N1XQ07yAyG70-21SUSZfSfk,19149
 sdmf/data_movement_framework/DataLoadController.py,sha256=e8NtvsK4gXQniym4DhjVSX9RT6NmF4klyaiXv2aYKx0,2797
-sdmf/data_movement_framework/LoadDispatcher.py,sha256=VZGJIpHMXlO7gi0g5gPr9o-Fhb3-m292wQkLGkLv-nw,3726
+sdmf/data_movement_framework/LoadDispatcher.py,sha256=UNbPnOXgc58nVWY-_dWKKInhxkgt11KQpyIvSKOc5GI,3846
 sdmf/data_movement_framework/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sdmf/data_movement_framework/data_class/LoadConfig.py,sha256=Jn0Un-Am-iJegtNpWBfo9NkXQRfErCf-EUzJA4oTe_A,262
 sdmf/data_movement_framework/data_class/LoadResult.py,sha256=XX5CUW50RS4n3igI3P6s6U2Oa4eGF66g_Zzh1cr1XSM,558
 sdmf/data_movement_framework/data_class/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sdmf/data_movement_framework/load_types/APIExtractor.py,sha256=g1dLjDtI27qdzyfSGrjtIC154TngInrbjZP1yyKD1DI,6927
-sdmf/data_movement_framework/load_types/AppendLoad.py,sha256=o3EPUHNNGfkWynKSaDyvG1yAGhdl7FghMK6d9nTw6kE,3325
-sdmf/data_movement_framework/load_types/FullLoad.py,sha256=roS6fjB8vcYf-7lsnqGuZHJi_QF5QUJ80qW-_m2pXPY,3686
-sdmf/data_movement_framework/load_types/IncrementalCDC.py,sha256=BQU8qUPWVQXP5XdEj7nzemeygQ4gMb1nIla7KX9e75A,8554
-sdmf/data_movement_framework/load_types/SCDType2.py,sha256=5gv8tAyE36-fXZFSN4lFg_5_ASgELPlcc6DiTWwXrqU,7771
+sdmf/data_movement_framework/load_types/AppendLoad.py,sha256=mez_zL6Snc5kyWZ8LZ0L1vW7l99FppDWW0WHbJfwZCI,3260
+sdmf/data_movement_framework/load_types/FullLoad.py,sha256=hqCYequ925kGNXqvZNIC8-CqCGIudaGDdzJrN9KQC0Y,3621
+sdmf/data_movement_framework/load_types/IncrementalCDC.py,sha256=QAj5Q3CngcMZ1EvLK3VTzuKLcaAYHUDo3yE-Vpi0mvo,8424
+sdmf/data_movement_framework/load_types/SCDType2.py,sha256=BfN47WBqrx2X2RIy_7PQupHtitzHXhgWqnZLBYh2wpk,7706
+sdmf/data_movement_framework/load_types/StorageFetch.py,sha256=VnPnK7tnKQv4C0etkwg5QgN59j9J31ODBdYzUU4-L0k,5785
 sdmf/data_movement_framework/load_types/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sdmf/data_quality/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-sdmf/data_quality/executors/ComprehensiveDQExecutor.py,sha256=ofgh_zd1Xdr3gdfY3SAd8q8zMv0S3OLLSW4y7eFsPy8,2225
+sdmf/data_quality/executors/ComprehensiveDQExecutor.py,sha256=REstqEtAMuyyi3G-qh9ju2OrX_Q1-cLDqG_36KQj49c,2237
 sdmf/data_quality/executors/StandardDQExecutor.py,sha256=z4aD8MYi6N1q-NrIsML8bLdU_fzioSVYvRA4PxqnixY,5612
 sdmf/data_quality/executors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sdmf/data_quality/model/FeedDQSummaryRow.py,sha256=pjZSYiqV-MAJ1jQGE77jFR5e2EvC1Z5CQUWmMt9YxXc,231
@@ -31,23 +31,15 @@ sdmf/data_quality/report/DQExcelReportWriter.py,sha256=5e7PSiivpFgDrNc5DOpHpRsMg
 sdmf/data_quality/report/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sdmf/data_quality/runner/FeedDataQualityRunner.py,sha256=3F8mJG1js9A4KAGuiUYDYJF3mrV1BaA0B5gktdmIrPs,4638
 sdmf/data_quality/runner/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-sdmf/exception/BasePipelineException.py,sha256=-npQ3BkZU-g0ltgTONMmWEiEm8HefoORmhcqOOX-gQc,2127
-sdmf/exception/DataLoadException.py,sha256=Zs3LTcvuMhdXAIC7RUuBiZaWWtQWatB92aMSMOHiskM,347
-sdmf/exception/DataQualityError.py,sha256=wn3QPZNwIxTI19djVsOxFxjzS8S6mgSXKPm2iZfQP64,344
-sdmf/exception/DataSpecRuleExecutionError.py,sha256=uzXJJWzMGSR49-IYXbsglPwGaazFdtyLQLpm734V3tI,373
-sdmf/exception/DataSpecValidationError.py,sha256=hiMdyuWpxvNS7ma_NUcDNIymsLSfBg4kQG2VZy5rzts,363
-sdmf/exception/EnvironmentPreparationError.py,sha256=w5A1UnPMmjOygPPwNjUeYbgrVrfKBhsucsi3_2Ko0go,371
-sdmf/exception/ExtractionException.py,sha256=lEYesSwV1XnkHpv9T5mtNrvGU54NsBVVK5arvOz5Jm4,356
-sdmf/exception/FeedSpecValidationError.py,sha256=XjLrCozHD2Ao4E6oW0jgRLD5K8zvwKDBYu7Lzbc2VxM,367
-sdmf/exception/ResultGenerationException.py,sha256=2ro3Fq3FiSPf3hn1BVBzLIzGzttlYRcaxi4imzk3Q0I,347
-sdmf/exception/SystemError.py,sha256=0FIykfTq39UNEm2lXiFT6w1Mype8Q82UD8xYKMZaeuU,344
-sdmf/exception/ValidationError.py,sha256=UN_stG-ySzDG9mZ3YMIn-XJddpBeoO2iE89IqEQcfgo,352
+sdmf/exception/BaseException.py,sha256=5WThiNLjkSe9rD5V37RPXR34lYWfVAqR5-EC9sD0Ujk,3080
+sdmf/exception/DataLoadException.py,sha256=kHJnWexMm3sOAwzRX88xod5IkxIuVZqUb8a6Pn84K0A,334
+sdmf/exception/DataQualityException.py,sha256=ww7apnNwikZ2nNk_Cas0Q7kalgCT4-vJgNdyzNs_yf8,348
+sdmf/exception/ExtractionException.py,sha256=0klZU0Jxl1ydARu-c-jjpy-D8ILmRKQFjwm4X_BkcbY,336
+sdmf/exception/ResultGenerationException.py,sha256=4wnGun3JTEpM6oIZFZ8mgtEhUdy2tvS5YuDa2m8y05k,349
+sdmf/exception/StorageFetchException.py,sha256=Zp-r2Xx5PnMIU1PMbvJaoRyDKc4bb34kvY0NU-6-CMk,341
+sdmf/exception/SystemError.py,sha256=qdSjWkxvwOb8dhmBQ-ek0s_5FgH1va7ILinOlwTNck8,320
+sdmf/exception/ValidationError.py,sha256=28YDbXd_1D-YfSUgekkyo1ze_FGI5kythpZ5CKv8fN0,328
 sdmf/exception/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-sdmf/extraction_toolkit/ExtractionController.py,sha256=q8Yfms1E0J4twH7Hx8jXN74GUR0XM4EtZv0HlQcoQsI,843
-sdmf/extraction_toolkit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-sdmf/extraction_toolkit/data_class/ExtractionConfig.py,sha256=dkJ_rc60RPuktw9MFdmneyxrQ7TgHUkzi83ATTIplxs,162
-sdmf/extraction_toolkit/data_class/ExtractionResult.py,sha256=ZzRKCK7vqJ5bk3CfYypXbAHV6tNLJA-asn_904Co1VU,467
-sdmf/extraction_toolkit/data_class/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sdmf/orchestrator/Orchestrator.py,sha256=UKkxnUmWITqx5nI1bsNsTSwdVNlGViN-fz9m5P_-aDI,6254
 sdmf/orchestrator/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sdmf/result_generator/ResultGenerator.py,sha256=PzmET2fJuhl8bjvUuXOFME2VlcNgKt95q0AD414u-qc,7228
@@ -71,7 +63,7 @@ sdmf/validation/validation_rules/StandardCheckStructureCheck.py,sha256=_5CrGlLsQ
 sdmf/validation/validation_rules/VacuumHoursCheck.py,sha256=FQI3RRpso2eQc_m6tX41KkOCAViNaKRTS42t8X4cfbQ,982
 sdmf/validation/validation_rules/ValidateFeedSpecsJSON.py,sha256=JRyYA1DaXHZT94oDWt1wm7Q5sghKe6OrjJydVII2ico,1046
 sdmf/validation/validation_rules/ValidateMasterSpecs.py,sha256=PJjL_goDrotit3D0bUWkcDqOgoNxPnFvUeGXfjluH54,666
-sdmf-0.1.6.dist-info/METADATA,sha256=2WpCdd6_zPHtTDg7pp2tF-nM7a2U76ZewscS1lx4ivs,20979
-sdmf-0.1.6.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
-sdmf-0.1.6.dist-info/top_level.txt,sha256=a67a3_q-4a9HG0C80uz5kmOlzfO4AFoqiNag1KhTpUs,5
-sdmf-0.1.6.dist-info/RECORD,,
+sdmf-0.1.8.dist-info/METADATA,sha256=5s0O3IFtbREH_aseS-uQFIgv1h3m7eT0jA7GDxITZYs,21159
+sdmf-0.1.8.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+sdmf-0.1.8.dist-info/top_level.txt,sha256=a67a3_q-4a9HG0C80uz5kmOlzfO4AFoqiNag1KhTpUs,5
+sdmf-0.1.8.dist-info/RECORD,,
sdmf/data_movement_framework/BaseExtractor.py DELETED
@@ -1,23 +0,0 @@
-# inbuilt
-import logging
-from abc import ABC, abstractmethod
-
-# external
-from pyspark.sql import SparkSession, DataFrame
-
-# internal
-from sdmf.data_movement_framework.data_class.LoadConfig import LoadConfig
-from sdmf.data_movement_framework.data_class.LoadResult import LoadResult
-
-class BaseExtractor(ABC):
-    def __init__(self, config: LoadConfig, spark: SparkSession) -> None:
-        self.logger = logging.getLogger(__name__)
-        self.config = config
-        self.spark = spark
-
-    @abstractmethod
-    def extract(self) -> LoadResult:
-        """
-        Core load logic implemented by subclass.
-        Should return IngestionResult on success.
-        """
sdmf/exception/BasePipelineException.py DELETED
@@ -1,59 +0,0 @@
-import sys
-import traceback
-import logging
-
-class BasePipelineException(Exception):
-    """
-    Unified base exception for all pipeline errors.
-    Automatically logs in pretty console format.
-    """
-
-    def __init__(self, message=None, details=None, context=None, original_exception=None):
-        super().__init__(message)
-
-        self.message = message or self.__class__.__name__
-        self.details = details
-        self.context = context or {}
-        self.original_exception = original_exception
-        self.traceback = details or None
-        self.logger = logging.getLogger(__name__)
-
-        # Capture exception info if available
-        exc_type, exc_value, _ = sys.exc_info()
-        self.exc_type = exc_type.__name__ if exc_type else None
-        self.exc_value = str(exc_value) if exc_value else None
-        self.full_error_info = ''.join(traceback.format_exception(*sys.exc_info())) if sys.exc_info()[0] else None
-
-
-        error_msg = self.__str__()
-        self.logger.error(f"{error_msg}, Full Message: {self.to_dict()}")
-
-    def __str__(self):
-        parts = [f"[{self.__class__.__name__}] {self.message}"]
-
-        if self.full_error_info:
-            parts.append(f"\nStack Trace:\n{self.full_error_info}")
-        if self.details:
-            parts.append(f"Details: {self.details}")
-        if self.context:
-            parts.append(f"Context: {self.context}")
-        if self.original_exception:
-            parts.append(f"Caused by: {repr(self.original_exception)}")
-        if self.exc_type:
-            parts.append(f"Exception Type: {self.exc_type}")
-        if self.exc_value:
-            parts.append(f"Exception Message: {self.exc_value}")
-
-        return " | ".join(parts)
-
-    def to_dict(self):
-        """Optional structured output if needed in MLflow or REST."""
-        return {
-            "error_type": self.__class__.__name__,
-            "message": self.message,
-            "details": self.details,
-            "context": self.context,
-            "original_exception": repr(self.original_exception),
-            "traceback": self.traceback,
-        }
-
sdmf/exception/DataQualityError.py DELETED
@@ -1,8 +0,0 @@
-from sdmf.exception.BasePipelineException import BasePipelineException
-
-class DataQualityError(BasePipelineException):
-    def __init__(self, message=None, details=None, original_exception=None):
-        super().__init__(message)
-        self.message = message
-        self.original_exception = original_exception
-        self.details = details
sdmf/exception/DataSpecRuleExecutionError.py DELETED
@@ -1,9 +0,0 @@
-from sdmf.exception.BasePipelineException import BasePipelineException
-
-class DataSpecRuleExecutionError(BasePipelineException):
-    def __init__(self, message=None, details=None, original_exception=None):
-        super().__init__(
-            message or "DataSpecRuleExecutionError",
-            details=details,
-            original_exception=original_exception
-        )
sdmf/exception/EnvironmentPreparationError.py DELETED
@@ -1,9 +0,0 @@
-from sdmf.exception.BasePipelineException import BasePipelineException
-
-class EnvironmentPreparationError(BasePipelineException):
-    def __init__(self, message=None, details=None, original_exception=None):
-        super().__init__(
-            message or "FeedSpecValidationError",
-            details=details,
-            original_exception=original_exception
-        )
sdmf/exception/FeedSpecValidationError.py DELETED
@@ -1,9 +0,0 @@
-from sdmf.exception.BasePipelineException import BasePipelineException
-
-class FeedSpecValidationError(BasePipelineException):
-    def __init__(self, message=None, details=None, original_exception=None):
-        super().__init__(
-            message or "FeedSpecValidationError",
-            details=details,
-            original_exception=original_exception
-        )
sdmf/extraction_toolkit/ExtractionController.py DELETED
@@ -1,33 +0,0 @@
-# inbuilt
-import os
-import logging
-import configparser
-from concurrent.futures import ThreadPoolExecutor, as_completed
-
-
-# external
-import pandas as pd
-from pyspark.sql import SparkSession
-
-# internal
-from sdmf.extraction_toolkit.data_class.ExtractionConfig import ExtractionConfig
-from sdmf.extraction_toolkit.data_class.ExtractionResult import ExtractionResult
-
-
-class ExtractionController():
-
-    def __init__(
-        self,
-        spark: SparkSession,
-        allowed_df: pd.DataFrame,
-        config: configparser.ConfigParser
-    ) -> None:
-        self.logger = logging.getLogger(__name__)
-        self.logger.info("Extraction Controller has been initialized...")
-        self.master_specs_df = allowed_df
-        self.spark = spark
-        self.extraction_results_list = []
-        self.config = config
-
-
-
sdmf/extraction_toolkit/__init__.py DELETED
File without changes
sdmf/extraction_toolkit/data_class/ExtractionConfig.py DELETED
@@ -1,9 +0,0 @@
-# inbuilt
-from dataclasses import dataclass
-
-@dataclass
-class ExtractionConfig:
-    master_specs: dict
-    feed_specs: dict
-    config: dict
-    target_table: str
sdmf/extraction_toolkit/data_class/ExtractionResult.py DELETED
@@ -1,19 +0,0 @@
-# inbuilt
-from dataclasses import dataclass
-from typing import Optional
-
-# external
-from pyspark.sql import DataFrame
-
-@dataclass
-class ExtractionResult:
-    feed_id: int
-    success: bool
-    skipped: bool = False
-    start_epoch: float = 0.0
-    end_epoch: float = 0.0
-    total_human_readable_time: str = ""
-    target_table_path: str = ""
-    data_frame: Optional[DataFrame] = None
-    total_rows_inserted: int = 0
-    exception_if_any: Optional[Exception] = None
sdmf/extraction_toolkit/data_class/__init__.py DELETED
File without changes
sdmf-0.1.6.dist-info/WHEEL → sdmf-0.1.8.dist-info/WHEEL RENAMED
File without changes
sdmf-0.1.6.dist-info/top_level.txt → sdmf-0.1.8.dist-info/top_level.txt RENAMED
File without changes