sdmf 0.1.6__py3-none-any.whl → 0.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sdmf/cli/main.py +4 -1
- sdmf/cli/main2.py +8 -1
- sdmf/data_movement_framework/BaseLoadStrategy.py +0 -4
- sdmf/data_movement_framework/LoadDispatcher.py +3 -1
- sdmf/data_movement_framework/load_types/AppendLoad.py +0 -1
- sdmf/data_movement_framework/load_types/FullLoad.py +0 -1
- sdmf/data_movement_framework/load_types/IncrementalCDC.py +0 -2
- sdmf/data_movement_framework/load_types/SCDType2.py +0 -1
- sdmf/data_movement_framework/load_types/StorageFetch.py +144 -0
- sdmf/data_quality/executors/ComprehensiveDQExecutor.py +2 -2
- sdmf/exception/BaseException.py +100 -0
- sdmf/exception/DataLoadException.py +8 -7
- sdmf/exception/{DataSpecValidationError.py → DataQualityException.py} +5 -4
- sdmf/exception/ExtractionException.py +3 -3
- sdmf/exception/ResultGenerationException.py +8 -7
- sdmf/exception/StorageFetchException.py +9 -0
- sdmf/exception/SystemError.py +2 -2
- sdmf/exception/ValidationError.py +3 -3
- {sdmf-0.1.6.dist-info → sdmf-0.1.7.dist-info}/METADATA +8 -5
- {sdmf-0.1.6.dist-info → sdmf-0.1.7.dist-info}/RECORD +22 -30
- sdmf/data_movement_framework/BaseExtractor.py +0 -23
- sdmf/exception/BasePipelineException.py +0 -59
- sdmf/exception/DataQualityError.py +0 -8
- sdmf/exception/DataSpecRuleExecutionError.py +0 -9
- sdmf/exception/EnvironmentPreparationError.py +0 -9
- sdmf/exception/FeedSpecValidationError.py +0 -9
- sdmf/extraction_toolkit/ExtractionController.py +0 -33
- sdmf/extraction_toolkit/__init__.py +0 -0
- sdmf/extraction_toolkit/data_class/ExtractionConfig.py +0 -9
- sdmf/extraction_toolkit/data_class/ExtractionResult.py +0 -19
- sdmf/extraction_toolkit/data_class/__init__.py +0 -0
- {sdmf-0.1.6.dist-info → sdmf-0.1.7.dist-info}/WHEEL +0 -0
- {sdmf-0.1.6.dist-info → sdmf-0.1.7.dist-info}/top_level.txt +0 -0
sdmf/cli/main.py
CHANGED
@@ -10,7 +10,10 @@ spark = (
     .config("spark.scheduler.mode", "FAIR")
     .config(
         "spark.jars.packages",
-        "
+        ",".join([
+            "io.delta:delta-spark_2.12:3.1.0",
+            "com.databricks:spark-xml_2.12:0.17.0"
+        ])
     )
     .config(
         "spark.sql.extensions",
sdmf/cli/main2.py
CHANGED
@@ -8,9 +8,13 @@ spark = (
     SparkSession.builder
     .appName("sdmf")
    .enableHiveSupport()
+    .config("spark.scheduler.mode", "FAIR")
     .config(
         "spark.jars.packages",
-        "
+        ",".join([
+            "io.delta:delta-spark_2.12:3.1.0",
+            "com.databricks:spark-xml_2.12:0.17.0"
+        ])
     )
     .config(
         "spark.sql.extensions",
@@ -43,3 +47,6 @@ spark = (
 # spark.sql('select count(*) from bronze.t_country_codes').show()
 
 
+
+spark.sql('select * from bronze.t_test2').show(truncate=False)
+
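Both CLI entrypoints now assemble spark.jars.packages from a list via ",".join(...) instead of a single hard-coded string (the deleted value is truncated in this diff). A minimal sketch of the builder these hunks converge on; the spark.sql.extensions value is cut off above, so the Delta extension class shown here is an assumption:

from pyspark.sql import SparkSession

# Coordinates copied from the hunks above.
packages = ",".join([
    "io.delta:delta-spark_2.12:3.1.0",
    "com.databricks:spark-xml_2.12:0.17.0",
])

spark = (
    SparkSession.builder
    .appName("sdmf")
    .enableHiveSupport()
    .config("spark.scheduler.mode", "FAIR")
    .config("spark.jars.packages", packages)
    # Assumed value; the diff truncates the spark.sql.extensions setting.
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .getOrCreate()
)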
sdmf/data_movement_framework/BaseLoadStrategy.py
CHANGED
@@ -80,7 +80,6 @@ class BaseLoadStrategy(ABC):
         except Exception as e:
             raise DataLoadException(
                 message="Somethine went wrong while executing data load",
-                load_type=self.config.master_specs["load_type"],
                 original_exception=e,
             )
 
@@ -385,7 +384,6 @@ class BaseLoadStrategy(ABC):
         except Exception as e:
             raise DataLoadException(
                 message=f"Error in staging layer for {self.config.feed_specs['source_table_name']}",
-                load_type=self.config.master_specs["load_type"],
                 original_exception=e,
             )
 
@@ -416,7 +414,6 @@ class BaseLoadStrategy(ABC):
                     f"Attempted: '{current_type}'. "
                     f"Switching load types is not permitted."
                 ),
-                load_type=self.config.master_specs["load_type"],
                 original_exception=None,
             )
         else:
@@ -438,6 +435,5 @@ class BaseLoadStrategy(ABC):
         except Exception as e:
             raise DataLoadException(
                 message="Something went wrong while enforcing load type consistency",
-                load_type=self.config.master_specs["load_type"],
                 original_exception=e,
             )
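All four hunks drop the load_type keyword from DataLoadException calls, matching the narrowed constructor in sdmf/exception/DataLoadException.py further down. A short sketch of the resulting call pattern, with a hypothetical failing step:

def run_load_step():
    raise RuntimeError("boom")  # hypothetical failure

try:
    run_load_step()
except Exception as e:
    raise DataLoadException(
        message="Something went wrong while executing data load",
        original_exception=e,
    )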
sdmf/data_movement_framework/LoadDispatcher.py
CHANGED
@@ -12,6 +12,7 @@ from sdmf.data_movement_framework.load_types.AppendLoad import AppendLoad
 from sdmf.data_movement_framework.load_types.IncrementalCDC import IncrementalCDC
 from sdmf.data_movement_framework.load_types.SCDType2 import SCDType2
 from sdmf.data_movement_framework.load_types.APIExtractor import APIExtractor
+from sdmf.data_movement_framework.load_types.StorageFetch import StorageFetch
 from sdmf.data_movement_framework.data_class.LoadConfig import LoadConfig
 from sdmf.data_movement_framework.data_class.LoadResult import LoadResult
 
@@ -56,7 +57,8 @@ class LoadDispatcher():
             "SCD_TYPE_2": SCDType2,
 
             # extraction
-            "API_EXTRACTOR": APIExtractor
+            "API_EXTRACTOR": APIExtractor,
+            "STORAGE_FETCH":StorageFetch
         }
 
         load_class = load_type_map.get(self.master_spec.get('load_type', ""))
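The dispatcher resolves the strategy class from the load_type key in the master spec, so registering StorageFetch is a one-line map entry. A minimal sketch of that lookup under a hypothetical master spec (fallback behaviour for unknown keys is not shown in this diff):

load_type_map = {
    "SCD_TYPE_2": SCDType2,
    "API_EXTRACTOR": APIExtractor,
    "STORAGE_FETCH": StorageFetch,
}

master_spec = {"load_type": "STORAGE_FETCH"}               # hypothetical spec
load_class = load_type_map.get(master_spec.get("load_type", ""))
strategy = load_class(config=config, spark=spark)          # LoadConfig and SparkSession assumed in scope
result = strategy.load()                                   # returns a LoadResult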
sdmf/data_movement_framework/load_types/FullLoad.py
CHANGED
@@ -81,6 +81,5 @@ class FullLoad(BaseLoadStrategy):
         except Exception as e:
             raise DataLoadException(
                 message=f"Feed ID: {self.config.master_specs['feed_id']}, Error during FULL LOAD for {self._current_target_table_name}: {str(e)}",
-                load_type=self.config.master_specs["load_type"],
                 original_exception=e
             )
sdmf/data_movement_framework/load_types/IncrementalCDC.py
CHANGED
@@ -134,7 +134,6 @@ class IncrementalCDC(BaseLoadStrategy):
         target_df = delta_target.toDF()
         if target_df.columns != incr_df.columns:
             raise DataLoadException(
-                load_type=self.config.feed_specs["load_type"],
                 original_exception=None,
                 message=f"Target table {target_table} schema [{target_df.columns}] does not match incremental data schema [{incr_df.columns}]."
             )
@@ -191,7 +190,6 @@ class IncrementalCDC(BaseLoadStrategy):
             )
         except Exception as e:
             raise DataLoadException(
-                load_type=self.config.feed_specs["load_type"],
                 original_exception=e,
                 message=f"Error during Incremental CDC load for {self._current_target_table_name}: {str(e)}"
             )
sdmf/data_movement_framework/load_types/SCDType2.py
CHANGED
@@ -162,7 +162,6 @@ class SCDType2(BaseLoadStrategy):
             )
         except Exception as e:
             raise DataLoadException(
-                load_type=self.config.master_specs["load_type"],
                 original_exception=e,
                 message=f"Error during SCD_TYPE_2 for {self._current_target_table_name}: {str(e)}"
             )
sdmf/data_movement_framework/load_types/StorageFetch.py
ADDED
@@ -0,0 +1,144 @@
+# inbuilt
+import os
+import uuid
+import time
+import random
+import logging
+import requests
+from io import BytesIO
+from requests.exceptions import RequestException
+
+# external
+from pyspark.sql import SparkSession, DataFrame
+from pyspark.sql.types import StructType
+from pyspark.sql.functions import input_file_name
+
+# internal
+from sdmf.data_movement_framework.BaseLoadStrategy import BaseLoadStrategy
+from sdmf.data_movement_framework.data_class.LoadConfig import LoadConfig
+from sdmf.data_movement_framework.data_class.LoadResult import LoadResult
+from sdmf.exception.StorageFetchException import StorageFetchException
+
+
+class StorageFetch(BaseLoadStrategy):
+
+    def __init__(self, config: LoadConfig, spark: SparkSession) -> None:
+        super().__init__(config=config, spark=spark)
+        self.logger = logging.getLogger(__name__)
+        self.config = config
+        self.spark = spark
+        self.file_type = self.config.feed_specs['storage_config']['file_type']
+        self.lookup_directory = self.config.feed_specs['storage_config']['lookup_directory']
+        if self.config.target_unity_catalog == "testing":
+            self.__bronze_schema = f"bronze"
+        else:
+            self.__bronze_schema = f"{self.config.target_unity_catalog}.bronze"
+        self.logger.warning('Storage Fetch will always dump data in bronze schema as per medallion architecture.')
+
+    def load(self) -> LoadResult:
+        try:
+
+            results_df = self.__load_file_to_dataframe()
+            self.spark.sql(f"CREATE SCHEMA IF NOT EXISTS {self.__bronze_schema}")
+            feed_temp = (
+                f"{self.__bronze_schema}."
+                f"{self.config.master_specs['target_table_name']}"
+            )
+            self.logger.info(f"Creating bronze table: {feed_temp}")
+
+
+            (
+                results_df.write.
+                format("delta")
+                .mode("overwrite")
+                .saveAsTable(feed_temp)
+            )
+            return LoadResult(
+                feed_id = self.config.master_specs['feed_id'],
+                success=True,
+                total_rows_inserted=results_df.count(),
+                total_rows_updated=0,
+                total_rows_deleted=0
+            )
+        except Exception as e:
+            raise StorageFetchException(
+                message=f"Feed ID: {self.config.master_specs['feed_id']}, Error during FULL LOAD for {self._current_target_table_name}: {str(e)}",
+                original_exception=e
+            )
+
+    def __iterate_over_latest_medallion_directory(self, base_path) -> str:
+        """
+        Returns the maximum integer directory under base_path.
+        Ignores files.
+        """
+        max_dir = float('-inf')
+        for item in os.listdir(base_path):
+            if max_dir < int(item):
+                max_dir = int(item)
+        return str(max_dir)
+
+    def __load_file_to_dataframe(self) -> DataFrame:
+        file_path = self.__build_file_destination_directory(self.lookup_directory)
+        self.logger.info(f"Fetching data from path: {file_path}")
+
+        if self.file_type == 'xml':
+            df = (
+                self.spark.read
+                .format("xml")
+                .option(
+                    "rowTag",
+                    self.config.feed_specs['storage_config']['xml_row_tag']
+                )
+                .load(file_path)
+            )
+
+        elif self.file_type == 'json':
+            df = (
+                self.spark.read
+                .format("json")
+                .load(file_path)
+            )
+
+        elif self.file_type == 'parquet':
+            df = (
+                self.spark.read
+                .format("json")
+                .load(file_path)
+            )
+
+        else:
+            raise StorageFetchException(
+                "Invalid/missing value for [file_type] parameter in feed specs"
+            )
+
+        schema = StructType.fromJson(self.config.feed_specs['selection_schema'])
+        df = self._enforce_schema(df, schema)
+        df = df.withColumn("_x_source_file", input_file_name())
+
+        return df
+
+    def __build_file_destination_directory(self, base_path_prefix: str) -> str:
+        storage_type = self.config.feed_specs['storage_config']['storage_type']
+        is_multi_file = self.config.feed_specs['storage_config']['is_multi_file']
+        inside_timestamp_dir = self.config.feed_specs['storage_config']['inside_timestamp_dir']
+        file_name = self.config.feed_specs['storage_config']['file_name']
+
+        if storage_type == 'MEDALLION':
+            current_year = self.__iterate_over_latest_medallion_directory(base_path_prefix)
+            current_month = self.__iterate_over_latest_medallion_directory(os.path.join(base_path_prefix, current_year))
+            current_day = self.__iterate_over_latest_medallion_directory(os.path.join(base_path_prefix, current_year, current_month))
+            latest_timestamp = self.__iterate_over_latest_medallion_directory(os.path.join(base_path_prefix, current_year, current_month, current_day))
+            if is_multi_file == True:
+                return f"{base_path_prefix}/{current_year}/{current_month}/{current_day}/{latest_timestamp}/{inside_timestamp_dir}/*.{self.file_type}"
+            else:
+                return f"{base_path_prefix}/{current_year}/{current_month}/{current_day}/{latest_timestamp}/{inside_timestamp_dir}/{file_name}"
+
+        elif storage_type == 'STANDARD':
+            if is_multi_file == True:
+                return f"{base_path_prefix}/*.{self.file_type}"
+            else:
+                return f"{base_path_prefix}/{file_name}"
+        else:
+            raise StorageFetchException(
+                "Invalid/missing value for [storage_type] parameter in feed specs"
+            )
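StorageFetch drives everything from feed_specs['storage_config']: file_type picks the Spark reader, storage_type picks the path layout, and __build_file_destination_directory walks year/month/day/timestamp directories for the MEDALLION layout. A sketch of a plausible config and the path it would resolve (all values hypothetical):

storage_config = {                      # hypothetical feed_specs['storage_config']
    "storage_type": "MEDALLION",
    "file_type": "xml",
    "xml_row_tag": "record",
    "lookup_directory": "/mnt/landing/feed_42",
    "is_multi_file": True,
    "inside_timestamp_dir": "data",
    "file_name": "export.xml",
}

# If the newest directories on disk are 2024/05/17/1715900000, the
# MEDALLION + multi-file branch above resolves the read path to:
#   /mnt/landing/feed_42/2024/05/17/1715900000/data/*.xml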
sdmf/data_quality/executors/ComprehensiveDQExecutor.py
CHANGED
@@ -2,7 +2,7 @@
 import logging
 
 # internal
-from sdmf.exception.
+from sdmf.exception.DataQualityException import DataQualityException
 
 
 class ComprehensiveDQExecutor:
@@ -24,7 +24,7 @@ class ComprehensiveDQExecutor:
         dependency_ds = check.get("dependency_dataset", [])
         for dds in dependency_ds:
             if self.spark.catalog.tableExists(dds) == False:
-                raise
+                raise DataQualityException
         query = check.get("query")
         severity = check.get("severity", "").upper()
         threshold = check.get("threshold", 0)
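The bare `raise DataQualityException` raises the class with no arguments, so nothing identifies the missing dataset. A sketch of the same guard with an explicit message, which the new exception signature supports (the wording here is an assumption, not the package's):

for dds in dependency_ds:
    if not self.spark.catalog.tableExists(dds):
        raise DataQualityException(
            message=f"Dependency dataset '{dds}' does not exist"
        )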
sdmf/exception/BaseException.py
ADDED
@@ -0,0 +1,100 @@
+# inbuilt
+import sys
+import traceback
+import logging
+
+
+class BaseException(Exception):
+    """
+    Unified base exception for all SDMF pipeline errors.
+    Automatically logs a clean, human-readable error block.
+    """
+
+    def __init__(
+        self,
+        message=None,
+        details=None,
+        context=None,
+        original_exception=None,
+        log=True,
+    ):
+        super().__init__(message)
+
+        self.message = message or self.__class__.__name__
+        self.details = details
+        self.context = context or {}
+        self.original_exception = original_exception
+
+        # Capture traceback safely
+        exc_type, exc_value, exc_tb = sys.exc_info()
+        self.exc_type = exc_type.__name__ if exc_type else None
+        self.exc_value = str(exc_value) if exc_value else None
+        self.full_traceback = (
+            "".join(traceback.format_exception(exc_type, exc_value, exc_tb))
+            if exc_type
+            else None
+        )
+
+        self.logger = logging.getLogger(__name__)
+
+        # Log once, cleanly
+        if log:
+            self.logger.error(self.to_pretty_text())
+
+    # --------------------------------------------------
+    # Human-readable output (for logs / console)
+    # --------------------------------------------------
+    def __str__(self):
+        return self.to_pretty_text()
+
+    def to_pretty_text(self):
+        return f"""
+==================== SDMF ERROR ====================
+
+Error Type:
+    {self.__class__.__name__}
+
+Message:
+    {self.message}
+
+-------------------- DETAILS --------------------
+{self._format_block(self.details)}
+
+-------------------- CONTEXT --------------------
+{self._format_block(self.context)}
+
+------------- ORIGINAL EXCEPTION ---------------
+{self._format_block(repr(self.original_exception) if self.original_exception else None)}
+
+------------------ STACK TRACE ------------------
+{self._format_block(self.full_traceback)}
+
+=================================================
+""".strip()
+
+    # --------------------------------------------------
+    # Structured output (for MLflow / REST / JSON)
+    # --------------------------------------------------
+    def to_dict(self):
+        """Structured error payload for APIs, MLflow, or persistence."""
+        return {
+            "error_type": self.__class__.__name__,
+            "message": self.message,
+            "details": self.details,
+            "context": self.context,
+            "original_exception": repr(self.original_exception)
+            if self.original_exception
+            else None,
+            "exception_type": self.exc_type,
+            "exception_message": self.exc_value,
+            "traceback": self.full_traceback,
+        }
+
+    # --------------------------------------------------
+    # Helpers
+    # --------------------------------------------------
+    @staticmethod
+    def _format_block(value):
+        if value in (None, "", {}, []):
+            return "N/A"
+        return value
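Every sdmf exception now funnels through this one base class, which logs the pretty error block on construction and exposes a structured payload for downstream tooling. A small usage sketch with a hypothetical failure:

try:
    {}["missing_key"]                          # hypothetical failing step
except KeyError as e:
    err = DataLoadException(                   # logs the SDMF ERROR block on construction
        message="Data Load Exception",
        details={"table": "bronze.t_test2"},   # hypothetical detail
        original_exception=e,
    )
    payload = err.to_dict()                    # structured output for MLflow / REST
    raise err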
sdmf/exception/DataLoadException.py
CHANGED
@@ -1,8 +1,9 @@
-from sdmf.exception.
+from sdmf.exception.BaseException import BaseException
 
-class DataLoadException(
-    def __init__(self, message
-        super().__init__(
-
-
-
+class DataLoadException(BaseException):
+    def __init__(self, message=None, details=None, original_exception=None):
+        super().__init__(
+            message or "Data Load Exception",
+            details=details,
+            original_exception=original_exception
+        )
sdmf/exception/{DataSpecValidationError.py → DataQualityException.py}
RENAMED
@@ -1,9 +1,10 @@
-from sdmf.exception.
+from sdmf.exception.BaseException import BaseException
 
-class
+class DataQualityException(BaseException):
     def __init__(self, message=None, details=None, original_exception=None):
         super().__init__(
-            message or "
+            message or "Data Quality Exception",
             details=details,
             original_exception=original_exception
-        )
+        )
+
sdmf/exception/ExtractionException.py
CHANGED
@@ -1,9 +1,9 @@
-from sdmf.exception.
+from sdmf.exception.BaseException import BaseException
 
-class ExtractionException(
+class ExtractionException(BaseException):
     def __init__(self, message=None, details=None, original_exception=None):
         super().__init__(
-            message or "Extraction
+            message or "Extraction Exception",
             details=details,
             original_exception=original_exception
         )
sdmf/exception/ResultGenerationException.py
CHANGED
@@ -1,8 +1,9 @@
-from sdmf.exception.
+from sdmf.exception.BaseException import BaseException
 
-class ResultGenerationException(
-    def __init__(self, message
-    super().__init__(
-
-
-
+class ResultGenerationException(BaseException):
+    def __init__(self, message=None, details=None, original_exception=None):
+        super().__init__(
+            message or "Result Generation Exception",
+            details=details,
+            original_exception=original_exception
+        )
sdmf/exception/StorageFetchException.py
ADDED
@@ -0,0 +1,9 @@
+from sdmf.exception.BaseException import BaseException
+
+class StorageFetchException(BaseException):
+    def __init__(self, message=None, details=None, original_exception=None):
+        super().__init__(
+            message or "Storage Fetch Exception",
+            details=details,
+            original_exception=original_exception
+        )
sdmf/exception/SystemError.py
CHANGED
@@ -1,6 +1,6 @@
-from sdmf.exception.
+from sdmf.exception.BaseException import BaseException
 
-class SystemError(
+class SystemError(BaseException):
     def __init__(self, message=None, details=None, original_exception=None):
         super().__init__(
             message or "System Error",
sdmf/exception/ValidationError.py
CHANGED
@@ -1,9 +1,9 @@
-from sdmf.exception.
+from sdmf.exception.BaseException import BaseException
 
-class ValidationError(
+class ValidationError(BaseException):
    def __init__(self, message=None, details=None, original_exception=None):
         super().__init__(
-            message or "
+            message or "Validation Error",
             details=details,
             original_exception=original_exception
         )
{sdmf-0.1.6.dist-info → sdmf-0.1.7.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sdmf
-Version: 0.1.
+Version: 0.1.7
 Summary: SDMF - Standard Data Management Framework
 Author: Harsh Hando
 Author-email: Harsh Handoo <handoo.harsh@gmail.com>
@@ -206,12 +206,13 @@ License: Apache License
        See the License for the specific language governing permissions and
        limitations under the License.
 
+Project-URL: Homepage, https://github.com/yalsworldofficial/standard-data-management-framework
+Project-URL: Repository, https://github.com/yalsworldofficial/standard-data-management-framework
+Project-URL: Documentation, https://github.com/yalsworldofficial/standard-data-management-framework#readme
+Project-URL: Changelog, https://github.com/yalsworldofficial/standard-data-management-framework/blob/main/CHANGELOG.md
+Project-URL: Issues, https://github.com/yalsworldofficial/standard-data-management-framework/issues
 Project-URL: License, https://github.com/yalsworldofficial/standard-data-management-framework/blob/main/LICENSE
 Classifier: License :: OSI Approved :: Apache Software License
-Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.9
-Classifier: Programming Language :: Python :: 3.10
-Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Operating System :: OS Independent
 Classifier: Development Status :: 3 - Alpha
@@ -220,6 +221,8 @@ Description-Content-Type: text/markdown
 Requires-Dist: pandas
 Requires-Dist: openpyxl
 Requires-Dist: matplotlib
+Requires-Dist: delta-spark
+Requires-Dist: numpy
 
 # Standard Data Management Framework (SDMF)
 
{sdmf-0.1.6.dist-info → sdmf-0.1.7.dist-info}/RECORD
CHANGED
@@ -1,28 +1,28 @@
 sdmf/__init__.py,sha256=dsAMhbmYMsgekduahb91sqYr9I2hag3Ezhlzsjo3k5g,117
 sdmf/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-sdmf/cli/main.py,sha256=
-sdmf/cli/main2.py,sha256=
+sdmf/cli/main.py,sha256=FiUoC8Zbr2oemaQKlHglJZjVNLrgDUGqCTa2TBmN_Go,3486
+sdmf/cli/main2.py,sha256=2RQ1YZvaTPMwEWz7ysF73uP-lCzF6wITjVds5gXa3w8,1604
 sdmf/cli/tt.py,sha256=ocxNSD93-bGWUk25cBJC8Zl3-mHCnvp45R3QLnwtNkI,128
 sdmf/config/LoggingConfig.py,sha256=-HeVRB12DNh5Lv8RTCxAY_jCDH-EKbcS2xV0tCgksLg,2984
 sdmf/config/LoggingPrettyFormatter.py,sha256=3UJBwmI_szxct1auy_YX9cM6qHD9EW1D_aSxlpJn6K8,1045
 sdmf/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sdmf/data_flow_diagram_generator/DataFlowDiagramGenerator.py,sha256=nerO6bH__g80VeqNQArFwEpdwJC9X1isLU3Q6hsAs6A,5852
-sdmf/data_movement_framework/
-sdmf/data_movement_framework/BaseLoadStrategy.py,sha256=FQAMu60iUxoQRY9MM-sNK9jbEPjjBrbIIEn5xm9NFvc,19421
+sdmf/data_movement_framework/BaseLoadStrategy.py,sha256=0JTwnckFKtYwhWxIzQJ4N1XQ07yAyG70-21SUSZfSfk,19149
 sdmf/data_movement_framework/DataLoadController.py,sha256=e8NtvsK4gXQniym4DhjVSX9RT6NmF4klyaiXv2aYKx0,2797
-sdmf/data_movement_framework/LoadDispatcher.py,sha256=
+sdmf/data_movement_framework/LoadDispatcher.py,sha256=UNbPnOXgc58nVWY-_dWKKInhxkgt11KQpyIvSKOc5GI,3846
 sdmf/data_movement_framework/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sdmf/data_movement_framework/data_class/LoadConfig.py,sha256=Jn0Un-Am-iJegtNpWBfo9NkXQRfErCf-EUzJA4oTe_A,262
 sdmf/data_movement_framework/data_class/LoadResult.py,sha256=XX5CUW50RS4n3igI3P6s6U2Oa4eGF66g_Zzh1cr1XSM,558
 sdmf/data_movement_framework/data_class/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sdmf/data_movement_framework/load_types/APIExtractor.py,sha256=g1dLjDtI27qdzyfSGrjtIC154TngInrbjZP1yyKD1DI,6927
-sdmf/data_movement_framework/load_types/AppendLoad.py,sha256=
-sdmf/data_movement_framework/load_types/FullLoad.py,sha256=
-sdmf/data_movement_framework/load_types/IncrementalCDC.py,sha256=
-sdmf/data_movement_framework/load_types/SCDType2.py,sha256=
+sdmf/data_movement_framework/load_types/AppendLoad.py,sha256=mez_zL6Snc5kyWZ8LZ0L1vW7l99FppDWW0WHbJfwZCI,3260
+sdmf/data_movement_framework/load_types/FullLoad.py,sha256=hqCYequ925kGNXqvZNIC8-CqCGIudaGDdzJrN9KQC0Y,3621
+sdmf/data_movement_framework/load_types/IncrementalCDC.py,sha256=QAj5Q3CngcMZ1EvLK3VTzuKLcaAYHUDo3yE-Vpi0mvo,8424
+sdmf/data_movement_framework/load_types/SCDType2.py,sha256=BfN47WBqrx2X2RIy_7PQupHtitzHXhgWqnZLBYh2wpk,7706
+sdmf/data_movement_framework/load_types/StorageFetch.py,sha256=VnPnK7tnKQv4C0etkwg5QgN59j9J31ODBdYzUU4-L0k,5785
 sdmf/data_movement_framework/load_types/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sdmf/data_quality/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-sdmf/data_quality/executors/ComprehensiveDQExecutor.py,sha256=
+sdmf/data_quality/executors/ComprehensiveDQExecutor.py,sha256=REstqEtAMuyyi3G-qh9ju2OrX_Q1-cLDqG_36KQj49c,2237
 sdmf/data_quality/executors/StandardDQExecutor.py,sha256=z4aD8MYi6N1q-NrIsML8bLdU_fzioSVYvRA4PxqnixY,5612
 sdmf/data_quality/executors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sdmf/data_quality/model/FeedDQSummaryRow.py,sha256=pjZSYiqV-MAJ1jQGE77jFR5e2EvC1Z5CQUWmMt9YxXc,231
@@ -31,23 +31,15 @@ sdmf/data_quality/report/DQExcelReportWriter.py,sha256=5e7PSiivpFgDrNc5DOpHpRsMg
 sdmf/data_quality/report/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sdmf/data_quality/runner/FeedDataQualityRunner.py,sha256=3F8mJG1js9A4KAGuiUYDYJF3mrV1BaA0B5gktdmIrPs,4638
 sdmf/data_quality/runner/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-sdmf/exception/
-sdmf/exception/DataLoadException.py,sha256=
-sdmf/exception/
-sdmf/exception/
-sdmf/exception/
-sdmf/exception/
-sdmf/exception/
-sdmf/exception/
-sdmf/exception/ResultGenerationException.py,sha256=2ro3Fq3FiSPf3hn1BVBzLIzGzttlYRcaxi4imzk3Q0I,347
-sdmf/exception/SystemError.py,sha256=0FIykfTq39UNEm2lXiFT6w1Mype8Q82UD8xYKMZaeuU,344
-sdmf/exception/ValidationError.py,sha256=UN_stG-ySzDG9mZ3YMIn-XJddpBeoO2iE89IqEQcfgo,352
+sdmf/exception/BaseException.py,sha256=5WThiNLjkSe9rD5V37RPXR34lYWfVAqR5-EC9sD0Ujk,3080
+sdmf/exception/DataLoadException.py,sha256=kHJnWexMm3sOAwzRX88xod5IkxIuVZqUb8a6Pn84K0A,334
+sdmf/exception/DataQualityException.py,sha256=ww7apnNwikZ2nNk_Cas0Q7kalgCT4-vJgNdyzNs_yf8,348
+sdmf/exception/ExtractionException.py,sha256=0klZU0Jxl1ydARu-c-jjpy-D8ILmRKQFjwm4X_BkcbY,336
+sdmf/exception/ResultGenerationException.py,sha256=4wnGun3JTEpM6oIZFZ8mgtEhUdy2tvS5YuDa2m8y05k,349
+sdmf/exception/StorageFetchException.py,sha256=Zp-r2Xx5PnMIU1PMbvJaoRyDKc4bb34kvY0NU-6-CMk,341
+sdmf/exception/SystemError.py,sha256=qdSjWkxvwOb8dhmBQ-ek0s_5FgH1va7ILinOlwTNck8,320
+sdmf/exception/ValidationError.py,sha256=28YDbXd_1D-YfSUgekkyo1ze_FGI5kythpZ5CKv8fN0,328
 sdmf/exception/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-sdmf/extraction_toolkit/ExtractionController.py,sha256=q8Yfms1E0J4twH7Hx8jXN74GUR0XM4EtZv0HlQcoQsI,843
-sdmf/extraction_toolkit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-sdmf/extraction_toolkit/data_class/ExtractionConfig.py,sha256=dkJ_rc60RPuktw9MFdmneyxrQ7TgHUkzi83ATTIplxs,162
-sdmf/extraction_toolkit/data_class/ExtractionResult.py,sha256=ZzRKCK7vqJ5bk3CfYypXbAHV6tNLJA-asn_904Co1VU,467
-sdmf/extraction_toolkit/data_class/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sdmf/orchestrator/Orchestrator.py,sha256=UKkxnUmWITqx5nI1bsNsTSwdVNlGViN-fz9m5P_-aDI,6254
 sdmf/orchestrator/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sdmf/result_generator/ResultGenerator.py,sha256=PzmET2fJuhl8bjvUuXOFME2VlcNgKt95q0AD414u-qc,7228
@@ -71,7 +63,7 @@ sdmf/validation/validation_rules/StandardCheckStructureCheck.py,sha256=_5CrGlLsQ
 sdmf/validation/validation_rules/VacuumHoursCheck.py,sha256=FQI3RRpso2eQc_m6tX41KkOCAViNaKRTS42t8X4cfbQ,982
 sdmf/validation/validation_rules/ValidateFeedSpecsJSON.py,sha256=JRyYA1DaXHZT94oDWt1wm7Q5sghKe6OrjJydVII2ico,1046
 sdmf/validation/validation_rules/ValidateMasterSpecs.py,sha256=PJjL_goDrotit3D0bUWkcDqOgoNxPnFvUeGXfjluH54,666
-sdmf-0.1.
-sdmf-0.1.
-sdmf-0.1.
-sdmf-0.1.
+sdmf-0.1.7.dist-info/METADATA,sha256=nSrsNqLm0a0IxH4RcXA3_ywXwbxqCoXwmM666ZM16lc,21345
+sdmf-0.1.7.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+sdmf-0.1.7.dist-info/top_level.txt,sha256=a67a3_q-4a9HG0C80uz5kmOlzfO4AFoqiNag1KhTpUs,5
+sdmf-0.1.7.dist-info/RECORD,,
sdmf/data_movement_framework/BaseExtractor.py
REMOVED
@@ -1,23 +0,0 @@
-# inbuilt
-import logging
-from abc import ABC, abstractmethod
-
-# external
-from pyspark.sql import SparkSession, DataFrame
-
-# internal
-from sdmf.data_movement_framework.data_class.LoadConfig import LoadConfig
-from sdmf.data_movement_framework.data_class.LoadResult import LoadResult
-
-class BaseExtractor(ABC):
-    def __init__(self, config: LoadConfig, spark: SparkSession) -> None:
-        self.logger = logging.getLogger(__name__)
-        self.config = config
-        self.spark = spark
-
-    @abstractmethod
-    def extract(self) -> LoadResult:
-        """
-        Core load logic implemented by subclass.
-        Should return IngestionResult on success.
-        """
sdmf/exception/BasePipelineException.py
REMOVED
@@ -1,59 +0,0 @@
-import sys
-import traceback
-import logging
-
-class BasePipelineException(Exception):
-    """
-    Unified base exception for all pipeline errors.
-    Automatically logs in pretty console format.
-    """
-
-    def __init__(self, message=None, details=None, context=None, original_exception=None):
-        super().__init__(message)
-
-        self.message = message or self.__class__.__name__
-        self.details = details
-        self.context = context or {}
-        self.original_exception = original_exception
-        self.traceback = details or None
-        self.logger = logging.getLogger(__name__)
-
-        # Capture exception info if available
-        exc_type, exc_value, _ = sys.exc_info()
-        self.exc_type = exc_type.__name__ if exc_type else None
-        self.exc_value = str(exc_value) if exc_value else None
-        self.full_error_info = ''.join(traceback.format_exception(*sys.exc_info())) if sys.exc_info()[0] else None
-
-
-        error_msg = self.__str__()
-        self.logger.error(f"{error_msg}, Full Message: {self.to_dict()}")
-
-    def __str__(self):
-        parts = [f"[{self.__class__.__name__}] {self.message}"]
-
-        if self.full_error_info:
-            parts.append(f"\nStack Trace:\n{self.full_error_info}")
-        if self.details:
-            parts.append(f"Details: {self.details}")
-        if self.context:
-            parts.append(f"Context: {self.context}")
-        if self.original_exception:
-            parts.append(f"Caused by: {repr(self.original_exception)}")
-        if self.exc_type:
-            parts.append(f"Exception Type: {self.exc_type}")
-        if self.exc_value:
-            parts.append(f"Exception Message: {self.exc_value}")
-
-        return " | ".join(parts)
-
-    def to_dict(self):
-        """Optional structured output if needed in MLflow or REST."""
-        return {
-            "error_type": self.__class__.__name__,
-            "message": self.message,
-            "details": self.details,
-            "context": self.context,
-            "original_exception": repr(self.original_exception),
-            "traceback": self.traceback,
-        }
-
sdmf/exception/DataQualityError.py
REMOVED
@@ -1,8 +0,0 @@
-from sdmf.exception.BasePipelineException import BasePipelineException
-
-class DataQualityError(BasePipelineException):
-    def __init__(self, message=None, details=None, original_exception=None):
-        super().__init__(message)
-        self.message = message
-        self.original_exception = original_exception
-        self.details = details
sdmf/exception/DataSpecRuleExecutionError.py
REMOVED
@@ -1,9 +0,0 @@
-from sdmf.exception.BasePipelineException import BasePipelineException
-
-class DataSpecRuleExecutionError(BasePipelineException):
-    def __init__(self, message=None, details=None, original_exception=None):
-        super().__init__(
-            message or "DataSpecRuleExecutionError",
-            details=details,
-            original_exception=original_exception
-        )
sdmf/exception/EnvironmentPreparationError.py
REMOVED
@@ -1,9 +0,0 @@
-from sdmf.exception.BasePipelineException import BasePipelineException
-
-class EnvironmentPreparationError(BasePipelineException):
-    def __init__(self, message=None, details=None, original_exception=None):
-        super().__init__(
-            message or "FeedSpecValidationError",
-            details=details,
-            original_exception=original_exception
-        )
sdmf/exception/FeedSpecValidationError.py
REMOVED
@@ -1,9 +0,0 @@
-from sdmf.exception.BasePipelineException import BasePipelineException
-
-class FeedSpecValidationError(BasePipelineException):
-    def __init__(self, message=None, details=None, original_exception=None):
-        super().__init__(
-            message or "FeedSpecValidationError",
-            details=details,
-            original_exception=original_exception
-        )
sdmf/extraction_toolkit/ExtractionController.py
REMOVED
@@ -1,33 +0,0 @@
-# inbuilt
-import os
-import logging
-import configparser
-from concurrent.futures import ThreadPoolExecutor, as_completed
-
-
-# external
-import pandas as pd
-from pyspark.sql import SparkSession
-
-# internal
-from sdmf.extraction_toolkit.data_class.ExtractionConfig import ExtractionConfig
-from sdmf.extraction_toolkit.data_class.ExtractionResult import ExtractionResult
-
-
-class ExtractionController():
-
-    def __init__(
-        self,
-        spark: SparkSession,
-        allowed_df: pd.DataFrame,
-        config: configparser.ConfigParser
-    ) -> None:
-        self.logger = logging.getLogger(__name__)
-        self.logger.info("Extraction Controller has been initialized...")
-        self.master_specs_df = allowed_df
-        self.spark = spark
-        self.extraction_results_list = []
-        self.config = config
-
-
-
sdmf/extraction_toolkit/__init__.py
REMOVED
File without changes
sdmf/extraction_toolkit/data_class/ExtractionResult.py
REMOVED
@@ -1,19 +0,0 @@
-# inbuilt
-from dataclasses import dataclass
-from typing import Optional
-
-# external
-from pyspark.sql import DataFrame
-
-@dataclass
-class ExtractionResult:
-    feed_id: int
-    success: bool
-    skipped: bool = False
-    start_epoch: float = 0.0
-    end_epoch: float = 0.0
-    total_human_readable_time: str = ""
-    target_table_path: str = ""
-    data_frame: Optional[DataFrame] = None
-    total_rows_inserted: int = 0
-    exception_if_any: Optional[Exception] = None

sdmf/extraction_toolkit/data_class/__init__.py
REMOVED
File without changes

{sdmf-0.1.6.dist-info → sdmf-0.1.7.dist-info}/WHEEL
File without changes

{sdmf-0.1.6.dist-info → sdmf-0.1.7.dist-info}/top_level.txt
File without changes