data-engineering-exp 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_engineering_exp/__init__.py +0 -0
- data_engineering_exp/cli.py +63 -0
- data_engineering_exp/core/__init__.py +0 -0
- data_engineering_exp/core/catalog.py +197 -0
- data_engineering_exp/core/deduplication.py +192 -0
- data_engineering_exp/core/initializer.py +93 -0
- data_engineering_exp/core/io.py +134 -0
- data_engineering_exp/pandas_core/__init__.py +0 -0
- data_engineering_exp/pandas_core/scd2.py +225 -0
- data_engineering_exp/spark_core/__init__.py +0 -0
- data_engineering_exp/spark_core/scd2.py +283 -0
- data_engineering_exp-0.1.0.dist-info/LICENSE +21 -0
- data_engineering_exp-0.1.0.dist-info/METADATA +107 -0
- data_engineering_exp-0.1.0.dist-info/RECORD +16 -0
- data_engineering_exp-0.1.0.dist-info/WHEEL +4 -0
- data_engineering_exp-0.1.0.dist-info/entry_points.txt +3 -0
|
File without changes
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"""Command Line Interface (CLI) entry point for dex using Click."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
|
|
5
|
+
import click
|
|
6
|
+
|
|
7
|
+
from data_engineering_exp.core.initializer import ProjectInitializer
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@click.group()
def entry_point() -> None:
    """dex - Data Engineering Experience CLI utilities.
    \f

    Root Click group; sub-commands attach themselves to it through
    ``@entry_point.command()``. Click uses this docstring as the ``--help``
    text and stops rendering at the form feed above, so the developer notes
    below never reach the terminal.

    Returns:
        None: Main CLI entry group.
    """
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@entry_point.command()
@click.option(
    "--path",
    type=click.Path(file_okay=False, dir_okay=True),
    default=".",
    help="Root directory where scaffolding will be built. Defaults to '.'",
)
def init(path: str) -> None:
    """Scaffold a new data engineering project structure interactively.
    \f

    Prompts for the project metadata (name, version, description, author) and
    hands the answers to ``ProjectInitializer.init_project``. Click stops
    rendering the help text at the form feed above, so this section is only
    visible to developers.

    Args:
        path(str): Target directory string path for project structure.

    Returns:
        None: Side-effect creating folders and configurations.
    """
    # NOTE(review): the emoji in the two messages below were restored from a
    # mis-encoded byte sequence; confirm them against the upstream source.
    click.echo("🚀 Welcome to the dex project initialization wizard!\n")

    # The default project name is the target directory's own name. abspath()
    # never yields "." and only the filesystem root has an empty basename, so
    # a plain falsy check covers the fallback case.
    default_name = os.path.basename(os.path.abspath(path)) or "my-dex-project"

    # Sequential metadata prompts with default fallbacks
    name = click.prompt("Project name", default=default_name, type=str)
    version = click.prompt("Version", default="0.1.0", type=str)
    description = click.prompt(
        "Description",
        default="Data engineering project scaffolded by dex",
        type=str,
    )
    author = click.prompt("Author", default="Anonymous", type=str)

    ProjectInitializer(base_path=path).init_project(
        name=name,
        version=version,
        description=description,
        author=author,
    )

    click.echo(f"\n✨ Project successfully initialized at '{path}'!")
|
|
File without changes
|
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
"""This module implements decentralized data catalog engines for dex."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
from typing import Any, Dict, List, Optional, cast
|
|
5
|
+
|
|
6
|
+
import yaml
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class DataCatalog:
    """Parses and consolidates declarative configurations from files or directories.

    Scans catalog definitions recursively, allowing user architectures to be
    split into mini-YAML files or kept within a single configuration layout.
    Every YAML document is expected to be a mapping of dataset name to a
    mapping of properties; all documents are merged into one dictionary.
    """

    def __init__(self, catalog_path: Optional[str] = None) -> None:
        """Initializes the DataCatalog by scanning files or configuration paths.

        If no path is provided, it seeks upwards from the current working
        directory for a ``pyproject.toml`` and uses ``conf/catalog`` next to it.
        With an explicit path the project root is derived by convention: two
        levels above a catalog directory, three levels above a catalog file.

        Args:
            catalog_path(Optional[str]): System directory path or file string
                pointing to catalog definitions. Defaults to None.

        Returns:
            None: Initializes the class instance.

        Raises:
            FileNotFoundError: If no ``pyproject.toml`` can be discovered or
                the resolved catalog path does not exist.
        """
        self.project_root: str = ""

        if catalog_path is None:
            catalog_path = self._discover_catalog_path()
        else:
            # Derive the root directory context if an explicit path is supplied
            abs_path = os.path.abspath(catalog_path)
            if os.path.isfile(abs_path):
                self.project_root = os.path.dirname(
                    os.path.dirname(os.path.dirname(abs_path))
                )
            else:
                self.project_root = os.path.dirname(os.path.dirname(abs_path))

        if not os.path.exists(catalog_path):
            raise FileNotFoundError(f"Catalog source path not found at: {catalog_path}")

        self._datasets: Dict[str, Dict[str, Any]] = {}
        self._load_catalog_sources(catalog_path)

    @property
    def dataset_names(self) -> List[str]:
        """Exposes the list of all dataset identifiers present in the catalog.

        Returns:
            List[str]: List of string names for all loaded datasets.
        """
        return list(self._datasets)

    def get_dataset_metadata(self, dataset_name: str) -> Dict[str, Any]:
        """Retrieves core metadata mapping configurations for a target dataset.

        Args:
            dataset_name(str): Unique identifier string of the dataset.

        Returns:
            Dict[str, Any]: Shallow copy of the dataset entry without its
                ``columns`` key.

        Raises:
            KeyError: If the target dataset name is missing from the catalog.
        """
        if dataset_name not in self._datasets:
            raise KeyError(f"Dataset '{dataset_name}' missing from catalog.")

        meta = self._datasets[dataset_name].copy()
        meta.pop("columns", None)
        return meta

    def get_column_names(self, dataset_name: str) -> List[str]:
        """Extracts the expected list of column names for a target dataset.

        Args:
            dataset_name(str): Unique identifier string of the dataset.

        Returns:
            List[str]: Column names in catalog order; empty when the dataset
                declares no columns.

        Raises:
            KeyError: If the target dataset name is missing from the catalog.
        """
        if dataset_name not in self._datasets:
            raise KeyError(f"Dataset '{dataset_name}' missing from catalog.")

        # A bare "columns:" key in YAML loads as None, which .get()'s default
        # does not cover; treat it the same as an absent key.
        columns_spec = self._datasets[dataset_name].get("columns") or []
        return [col["name"] for col in columns_spec]

    def validate_schema_presence(self, df: Any, dataset_name: str) -> bool:
        """Validates that all columns defined in the catalog exist in the dataframe.

        Extra columns in the dataframe are accepted; only missing ones fail.

        Args:
            df(Any): Active input Pandas or PySpark DataFrame object.
            dataset_name(str): Catalog identifier string of the dataset target.

        Returns:
            bool: True if all expected columns are present, raises ValueError
                otherwise.

        Raises:
            KeyError: If the dataset name is missing from the catalog.
            TypeError: If the input dataframe structure is not supported.
            ValueError: If a column mismatch is discovered against the catalog.
        """
        expected_cols = set(self.get_column_names(dataset_name))

        # Detect the framework by class/module name so neither library has to
        # be imported here.
        df_type = type(df).__name__
        df_module = type(df).__module__
        if df_type == "DataFrame" and "pandas" in df_module:
            actual_cols = set(df.columns.tolist())
        elif df_type == "DataFrame" and "pyspark" in df_module:
            actual_cols = set(df.columns)
        else:
            raise TypeError("Unsupported DataFrame type for validation.")

        missing_cols = expected_cols - actual_cols
        if missing_cols:
            # Sorted so the message is reproducible across runs (set order is
            # arbitrary); key=str keeps mixed-type column labels sortable.
            raise ValueError(
                f"Schema mismatch for '{dataset_name}'. "
                f"Missing expected catalog columns: {sorted(missing_cols, key=str)}"
            )

        return True

    def _discover_catalog_path(self) -> str:
        """Traverses parent directories upwards to locate the pyproject.toml file.

        Once the anchor file is found, ``self.project_root`` is set to its
        directory and the standard 'conf/catalog' location below it is returned.

        Returns:
            str: Path to the conventional catalog directory (not checked for
                existence here).

        Raises:
            FileNotFoundError: If the pyproject.toml configuration cannot be found.
        """
        current_dir = os.getcwd()

        while True:
            if os.path.exists(os.path.join(current_dir, "pyproject.toml")):
                self.project_root = current_dir
                return os.path.join(current_dir, "conf", "catalog")

            parent_dir = os.path.dirname(current_dir)
            if parent_dir == current_dir:
                # dirname() is a fixed point only at the filesystem root.
                break
            current_dir = parent_dir

        raise FileNotFoundError(
            "Configuration file (pyproject.toml) could not be located. "
            "Please run 'dex init' to establish a valid project layout."
        )

    def _load_catalog_sources(self, path: str) -> None:
        """Internal worker to process and parse path targets recursively.

        A file path is parsed only when it has a YAML extension; a directory
        is walked recursively. Directories and files are visited in sorted
        order so that, when two files define the same dataset name, the
        winning definition is the same on every machine.

        Args:
            path(str): Targeting file path or folder configuration directory.

        Returns:
            None: Populates internal dictionary instances.
        """
        yaml_suffixes = (".yml", ".yaml")

        if os.path.isfile(path):
            if path.endswith(yaml_suffixes):
                self._parse_file(path)
            return

        for root, dirs, files in os.walk(path):
            dirs.sort()  # in-place: steers os.walk's descent order
            for file in sorted(files):
                if file.endswith(yaml_suffixes):
                    self._parse_file(os.path.join(root, file))

    def _parse_file(self, file_path: str) -> None:
        """Parses an individual YAML catalog file and updates target states.

        Documents that are empty or whose top level is not a mapping are
        ignored. Dataset names already loaded are overwritten by this file.

        Args:
            file_path(str): Exact system string location pointing to file.

        Returns:
            None: Updates the core datasets dictionary data states.
        """
        with open(file_path, "r", encoding="utf-8") as stream:
            content = yaml.safe_load(stream)

        if content and isinstance(content, dict):
            typed_content = cast(Dict[str, Dict[str, Any]], content)
            self._datasets.update(typed_content)
|
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
"""This module implements versatile data deduplication utilities for dex."""
|
|
2
|
+
|
|
3
|
+
from typing import Any, List, Optional, Union
|
|
4
|
+
|
|
5
|
+
# Handle optional dependencies at the top of the file
|
|
6
|
+
try:
|
|
7
|
+
import pyspark.sql.functions as F
|
|
8
|
+
from pyspark.sql import Window
|
|
9
|
+
|
|
10
|
+
HAS_SPARK = True
|
|
11
|
+
except ImportError:
|
|
12
|
+
Window: Any = None
|
|
13
|
+
F: Any = None
|
|
14
|
+
HAS_SPARK = False
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class Deduplicator:
    """Utility class providing multiple strategies for data deduplication.

    Supports Pandas and PySpark DataFrames dynamically, allowing extraction of
    first, latest, multi-sorted, consolidated, or strictly unique records.
    """

    def __init__(self) -> None:
        """Initializes the Deduplicator instance (it holds no state).

        Returns:
            None: Initializes the class instance.
        """

    def latest(self, df: Any, keys: List[str], order_by_col: str) -> Any:
        """Extracts the most recent record per business key group.

        Args:
            df(Any): Input Pandas or PySpark DataFrame.
            keys(List[str]): Columns that identify a business entity.
            order_by_col(str): Column used to determine chronological order.

        Returns:
            Any: Deduplicated DataFrame containing only the latest events.
        """
        if self._detect_framework(df) == "pandas":
            # Nulls in the ordering column sort first so they never win.
            return self._pick_row_per_key(
                df, keys, order_by_col, True, keep="last", na_position="first"
            )

        window_spec = Window.partitionBy(keys).orderBy(F.col(order_by_col).desc())
        return self._keep_top_ranked(df, window_spec)

    def first(self, df: Any, keys: List[str], order_by_col: str) -> Any:
        """Extracts the earliest chronological record per business key group.

        Args:
            df(Any): Input Pandas or PySpark DataFrame.
            keys(List[str]): Columns that identify a business entity.
            order_by_col(str): Column used to determine chronological order.

        Returns:
            Any: Deduplicated DataFrame containing only the first events.
        """
        if self._detect_framework(df) == "pandas":
            return self._pick_row_per_key(df, keys, order_by_col, True, keep="first")

        window_spec = Window.partitionBy(keys).orderBy(F.col(order_by_col).asc())
        return self._keep_top_ranked(df, window_spec)

    def distinct(self, df: Any, subset: Optional[List[str]] = None) -> Any:
        """Removes strict duplicate rows from the DataFrame.

        Args:
            df(Any): Input Pandas or PySpark DataFrame.
            subset(Optional[List[str]]): Specific columns to consider when
                identifying duplicates. Defaults to None (all columns).

        Returns:
            Any: Clean DataFrame with unique records only.
        """
        if self._detect_framework(df) == "pandas":
            return df.drop_duplicates(subset=subset).reset_index(drop=True)

        # Passed positionally so the call does not depend on the parameter
        # name PySpark gives the column list.
        if subset:
            return df.dropDuplicates(subset)
        return df.dropDuplicates()

    def by_order(
        self,
        df: Any,
        keys: List[str],
        order_by_cols: List[str],
        ascending: Union[bool, List[bool]] = True,
    ) -> Any:
        """Deduplicates rows by sorting through multiple custom column criteria.

        Args:
            df(Any): Input Pandas or PySpark DataFrame.
            keys(List[str]): Columns that identify a business entity.
            order_by_cols(List[str]): Ordered list of columns to sort by.
            ascending(Union[bool, List[bool]]): Type of sorting direction for
                each column. Defaults to True.

        Returns:
            Any: Deduplicated DataFrame containing the top-ranked records.

        Raises:
            ValueError: If ``ascending`` is a list whose length differs from
                ``order_by_cols``.
        """
        if self._detect_framework(df) == "pandas":
            return self._pick_row_per_key(
                df, keys, order_by_cols, ascending, keep="first"
            )

        if isinstance(ascending, bool):
            asc_list = [ascending] * len(order_by_cols)
        else:
            asc_list = list(ascending)
            # zip() below would silently drop the surplus columns otherwise;
            # pandas rejects the same mismatch with a ValueError.
            if len(asc_list) != len(order_by_cols):
                raise ValueError(
                    "Length of ascending must match length of order_by_cols."
                )

        order_exprs = [
            F.col(col).asc() if asc_dir else F.col(col).desc()
            for col, asc_dir in zip(order_by_cols, asc_list)
        ]
        window_spec = Window.partitionBy(keys).orderBy(*order_exprs)
        return self._keep_top_ranked(df, window_spec)

    def combined(self, df: Any, keys: List[str]) -> Any:
        """Stitches rows together by compacting nulls into a golden record.

        For every non-key column the first non-null value found in the group
        is kept, so the resulting row may mix values from several input rows.

        Args:
            df(Any): Input Pandas or PySpark DataFrame.
            keys(List[str]): Columns that identify a business entity.

        Returns:
            Any: Consolidated DataFrame with filled attributes per group.
        """
        if self._detect_framework(df) == "pandas":
            # GroupBy.first() is per-column "first non-null", which is exactly
            # the stitching wanted here.
            return df.groupby(keys).first().reset_index()

        agg_cols = [
            F.first(c, ignorenulls=True).alias(c) for c in df.columns if c not in keys
        ]
        return df.groupBy(keys).agg(*agg_cols)

    @staticmethod
    def _pick_row_per_key(
        df: Any,
        keys: Any,
        order_by: Any,
        ascending: Any,
        keep: str,
        na_position: str = "last",
    ) -> Any:
        """Returns one whole input row per key group from a Pandas DataFrame.

        ``groupby(keys).first()/.last()`` pick the first/last *non-null value
        of each column* and can therefore blend several rows into one record.
        Sorting and then dropping duplicate keys always returns a real row.
        The output layout matches what the groupby form produced: rows whose
        key contains a null are left out, key columns come first, and rows are
        sorted by key.
        """
        key_cols = [keys] if isinstance(keys, str) else list(keys)
        ranked = df.dropna(subset=key_cols).sort_values(
            by=order_by, ascending=ascending, kind="stable", na_position=na_position
        )
        winners = ranked.drop_duplicates(subset=key_cols, keep=keep)
        winners = winners.sort_values(by=key_cols)
        layout = key_cols + [c for c in winners.columns if c not in key_cols]
        return winners[layout].reset_index(drop=True)

    @staticmethod
    def _keep_top_ranked(df: Any, window_spec: Any) -> Any:
        """Keeps the row ranked first inside each Spark window partition."""
        return (
            df.withColumn("_row_num", F.row_number().over(window_spec))
            .filter(F.col("_row_num") == 1)
            .drop("_row_num")
        )

    def _detect_framework(self, df: Any) -> str:
        """Internal helper to identify the dataframe framework type safely.

        Detection is based on the class and module names, so neither library
        needs to be imported to recognise the other.

        Args:
            df(Any): Input dataframe object.

        Returns:
            str: String identifier ("pandas" or "spark").

        Raises:
            ImportError: If a PySpark DataFrame is passed but PySpark could
                not be imported by this module.
            TypeError: If the object is neither a Pandas nor a PySpark
                DataFrame.
        """
        df_type = type(df).__name__
        module_type = type(df).__module__

        if "pandas" in module_type and df_type == "DataFrame":
            return "pandas"

        if "pyspark" in module_type and df_type == "DataFrame":
            if not HAS_SPARK:
                raise ImportError("PySpark is required but could not be imported.")
            return "spark"

        raise TypeError(
            f"Unsupported type: {type(df)}. "
            "Only Pandas and PySpark DataFrames are supported."
        )
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
"""This module implements the project structural initialization engine for dex."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class ProjectInitializer:
    """Handles scaffolding of directory structures for new data engineering setups.

    Automates the creation of standard configuration folders and source directories
    to establish a unified project layout.
    """

    def __init__(self, base_path: str = ".") -> None:
        """Initializes the ProjectInitializer with a target base path.

        Args:
            base_path(str): Root system path where scaffolding will be built.
                Defaults to ".".

        Returns:
            None: Initializes the class instance.
        """
        self.base_path = base_path

    def init_project(
        self,
        name: str = "my-dex-project",
        version: str = "0.1.0",
        description: str = "Data engineering project scaffolded by dex",
        author: str = "Anonymous",
    ) -> None:
        """Scaffolds the required folders and creates baseline configuration templates.

        Creates ``conf/catalog``, ``src/notebooks`` and ``data`` under the base
        path, then writes ``pyproject.toml``, ``data/sample_table.csv`` and
        ``conf/catalog/sample_dataset.yaml``. Files that already exist are
        left untouched, so running the scaffold inside an existing project
        cannot destroy its ``pyproject.toml``.

        Args:
            name(str): Operational name of the data project. Defaults to
                "my-dex-project".
            version(str): Initial semver string version. Defaults to "0.1.0".
            description(str): Short purpose statement describing the repository.
                Defaults to "Data engineering project scaffolded by dex".
            author(str): Full name or alias identifier of the creator. Defaults
                to "Anonymous".

        Returns:
            None: Side-effect function creating disk structures.
        """
        for parts in (("conf", "catalog"), ("src", "notebooks"), ("data",)):
            os.makedirs(os.path.join(self.base_path, *parts), exist_ok=True)

        # Create a clean, standard pyproject.toml as the anchor of the project.
        # Every user-supplied value is escaped so quotes or backslashes typed
        # at the prompt cannot produce an unparsable file.
        quote = self._toml_string
        toml_content = (
            "[project]\n"
            f"name = {quote(name)}\n"
            f"version = {quote(version)}\n"
            f"description = {quote(description)}\n"
            "authors = [\n"
            f"    {{name = {quote(author)}}}\n"
            "]\n"
            'requires-python = ">=3.11,<4.0.0"\n'
        )
        self._write_if_absent(
            os.path.join(self.base_path, "pyproject.toml"), toml_content
        )

        # Drop a real physical sample CSV file into the new data folder
        self._write_if_absent(
            os.path.join(self.base_path, "data", "sample_table.csv"),
            "id,name\n1,Alice\n2,Bob\n",
        )

        # Inject a sample boilerplate YAML catalog pointing to the real CSV
        sample_content = (
            "sample_table:\n"
            "  description: 'Boilerplate example dataset created by dex'\n"
            "  format: 'csv'\n"
            "  engine: 'pandas'\n"
            "  storage_path: 'data/sample_table.csv'\n"
            "  columns:\n"
            "    - name: 'id'\n"
            "      type: 'integer'\n"
            "    - name: 'name'\n"
            "      type: 'string'\n"
        )
        self._write_if_absent(
            os.path.join(self.base_path, "conf", "catalog", "sample_dataset.yaml"),
            sample_content,
        )

    @staticmethod
    def _toml_string(value: str) -> str:
        """Renders ``value`` as a double-quoted TOML basic string."""
        import json  # function-scope: this module otherwise only imports os

        # JSON string escapes are valid TOML basic-string escapes; DEL is the
        # one character TOML forbids raw that JSON leaves unescaped.
        return json.dumps(str(value), ensure_ascii=False).replace("\x7f", "\\u007f")

    @staticmethod
    def _write_if_absent(path: str, content: str) -> None:
        """Writes ``content`` to ``path`` unless a file is already there."""
        try:
            # Mode "x" fails atomically when the file exists.
            with open(path, "x", encoding="utf-8") as handle:
                handle.write(content)
        except FileExistsError:
            pass  # deliberate: never clobber the user's existing files
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
"""This module implements metadata-driven data loading utilities for dex."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
from typing import Any, Optional, cast
|
|
5
|
+
|
|
6
|
+
import pandas as pd
|
|
7
|
+
|
|
8
|
+
from data_engineering_exp.core.catalog import DataCatalog
|
|
9
|
+
|
|
10
|
+
try:
|
|
11
|
+
from pyspark.sql import SparkSession
|
|
12
|
+
|
|
13
|
+
HAS_SPARK = True
|
|
14
|
+
except ImportError:
|
|
15
|
+
SparkSession = cast(Any, None)
|
|
16
|
+
HAS_SPARK = False
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class DataLoader:
    """Handles dynamic loading of datasets using catalog metadata definitions.

    Supports loading data via Pandas or PySpark based on the specified
    execution engine and format configurations.
    """

    def __init__(self, catalog: Optional["DataCatalog"] = None) -> None:
        """Initializes the DataLoader with a DataCatalog instance.

        Args:
            catalog(Optional[DataCatalog]): An instance of DataCatalog. If
                None, a new instance is automatically discovered.

        Returns:
            None: Initializes the class instance.
        """
        self.catalog = catalog if catalog is not None else DataCatalog()

    def load(self, dataset_name: str, spark: Optional[Any] = None) -> Any:
        """Loads a dataset from storage based on its catalog specification.

        The ``engine`` and ``format`` values are matched case-insensitively.

        Args:
            dataset_name(str): Unique identifier string of the dataset.
            spark(Optional[Any]): Active PySpark SparkSession instance. Required
                if engine is 'spark' and no active session is globally found.

        Returns:
            Any: Loaded Pandas or PySpark DataFrame object.

        Raises:
            KeyError: If mandatory keys like 'engine', 'format' or
                'storage_path' are missing from metadata.
            ValueError: If an unsupported engine or format is provided.
            ImportError: If the PySpark engine is requested but not installed.
        """
        meta = self.catalog.get_dataset_metadata(dataset_name)

        engine = meta.get("engine")
        data_format = meta.get("format")
        path = meta.get("storage_path")

        if not engine or not data_format or not path:
            raise KeyError(
                f"Dataset '{dataset_name}' metadata must contain "
                "'engine', 'format', and 'storage_path'."
            )

        path = self._resolve_path(path)
        engine_key = str(engine).strip().lower()

        if engine_key == "pandas":
            return self._load_with_pandas(path, data_format)
        if engine_key == "spark":
            return self._load_with_spark(path, data_format, spark)
        raise ValueError(f"Unsupported execution engine: '{engine}'.")

    def _resolve_path(self, path: str) -> str:
        """Anchors a relative filesystem path to the catalog's project root.

        Absolute paths and URI-style locations (anything containing ``://``,
        e.g. ``s3://bucket/key``) are returned unchanged: joining a URI to the
        project root and normalising it would collapse the ``//`` and turn it
        into a bogus local path.

        Args:
            path(str): Storage path as written in the catalog.

        Returns:
            str: Path ready to hand to the reader.
        """
        if "://" in path or os.path.isabs(path):
            return path
        return os.path.normpath(os.path.join(self.catalog.project_root, path))

    def _load_with_pandas(self, path: str, data_format: str) -> pd.DataFrame:
        """Internal helper to route reading operations to Pandas.

        Args:
            path(str): Target system string file location path.
            data_format(str): File format layout specification identifier.

        Returns:
            pd.DataFrame: Loaded Pandas DataFrame object.

        Raises:
            ValueError: If the file format layout is unsupported by Pandas.
        """
        format_key = str(data_format).strip().lower()
        if format_key == "parquet":
            return pd.read_parquet(path)
        if format_key == "csv":
            return pd.read_csv(path)
        raise ValueError(f"Unsupported Pandas format: '{data_format}'.")

    def _load_with_spark(
        self, path: str, data_format: str, spark: Optional[Any]
    ) -> Any:
        """Internal worker to route reading operations to PySpark.

        Args:
            path(str): Target system string file location path.
            data_format(str): File format layout specification identifier.
            spark(Optional[Any]): Explicit user supplied SparkSession object.

        Returns:
            Any: Loaded PySpark DataFrame object.

        Raises:
            ImportError: If the PySpark dependency library is uninstalled.
            ValueError: If no valid operational Spark session context is found
                or if the layout format is unsupported.
        """
        if not HAS_SPARK:
            raise ImportError("PySpark is required but could not be imported.")

        # Prefer the caller's session; fall back to the active one.
        session = spark if spark is not None else SparkSession.getActiveSession()
        if session is None:
            raise ValueError(
                "A valid SparkSession must be provided or active globally "
                "to load data using the 'spark' engine."
            )

        format_key = str(data_format).strip().lower()
        if format_key == "parquet":
            return session.read.parquet(path)
        if format_key == "csv":
            # CSV files are read with a header row and inferred column types.
            return session.read.csv(path, header=True, inferSchema=True)
        raise ValueError(f"Unsupported Spark format: '{data_format}'.")
|
|
File without changes
|
|
@@ -0,0 +1,225 @@
|
|
|
1
|
+
"""This module implements Slowly Changing Dimension Type 2 (SCD2) logic using Pandas."""
|
|
2
|
+
|
|
3
|
+
from typing import List
|
|
4
|
+
|
|
5
|
+
import pandas as pd
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class PandasSCD2Processor:
|
|
9
|
+
"""Handles Slowly Changing Dimension Type 2 (SCD2) logic using Pandas.
|
|
10
|
+
|
|
11
|
+
This class encapsulates configuration parameters such as keys and metadata
|
|
12
|
+
columns, allowing clean and repeatable executions of SCD2 updates.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
def __init__(
|
|
16
|
+
self,
|
|
17
|
+
business_keys: List[str],
|
|
18
|
+
compare_columns: List[str],
|
|
19
|
+
effective_date_col: str = "effective_date",
|
|
20
|
+
end_date_col: str = "end_date",
|
|
21
|
+
current_flag_col: str = "is_current",
|
|
22
|
+
input_timestamp_col: str = "input_timestamp",
|
|
23
|
+
max_date: str = "9999-12-31",
|
|
24
|
+
):
|
|
25
|
+
"""Initializes the PandasSCD2Processor with configuration parameters.
|
|
26
|
+
|
|
27
|
+
Args:
|
|
28
|
+
business_keys(List[str]): List of columns that uniquely identify
|
|
29
|
+
a record.
|
|
30
|
+
compare_columns(List[str]): List of columns used to detect
|
|
31
|
+
changes in the data.
|
|
32
|
+
effective_date_col(str): Name of the effective start date column.
|
|
33
|
+
Defaults to "effective_date".
|
|
34
|
+
end_date_col(str): Name of the effective end date column.
|
|
35
|
+
Defaults to "end_date".
|
|
36
|
+
current_flag_col(str): Name of the boolean flag column
|
|
37
|
+
indicating the active record. Defaults to "is_current".
|
|
38
|
+
input_timestamp_col(str): Name of the column containing the
|
|
39
|
+
ingestion/mutation timestamp. Defaults to "input_timestamp".
|
|
40
|
+
max_date(str): The maximum date string used to populate active
|
|
41
|
+
records. Defaults to "9999-12-31".
|
|
42
|
+
|
|
43
|
+
Returns:
|
|
44
|
+
None: This method initializes the class instance.
|
|
45
|
+
"""
|
|
46
|
+
self.business_keys = business_keys
|
|
47
|
+
self.compare_columns = compare_columns
|
|
48
|
+
self.effective_date_col = effective_date_col
|
|
49
|
+
self.end_date_col = end_date_col
|
|
50
|
+
self.current_flag_col = current_flag_col
|
|
51
|
+
self.input_timestamp_col = input_timestamp_col
|
|
52
|
+
self.max_date = max_date
|
|
53
|
+
|
|
54
|
+
def process(
|
|
55
|
+
self, df_new: pd.DataFrame, df_existing: pd.DataFrame
|
|
56
|
+
) -> pd.DataFrame:
|
|
57
|
+
"""Orchestrates the SCD2 pipeline process and returns result.
|
|
58
|
+
|
|
59
|
+
Args:
|
|
60
|
+
df_new(pd.DataFrame): Incoming batch of data.
|
|
61
|
+
df_existing(pd.DataFrame): Current state of dimension.
|
|
62
|
+
|
|
63
|
+
Returns:
|
|
64
|
+
pd.DataFrame: Unified DataFrame with all processed records.
|
|
65
|
+
"""
|
|
66
|
+
if not self.compare_columns:
|
|
67
|
+
raise ValueError(
|
|
68
|
+
"compare_columns cannot be empty for SCD2 processing."
|
|
69
|
+
)
|
|
70
|
+
|
|
71
|
+
# Filtrar solo los registros activos de la dimensión histórica
|
|
72
|
+
# Avoid equality comparison to True; use truthiness check instead
|
|
73
|
+
df_active = df_existing[df_existing[self.current_flag_col]]
|
|
74
|
+
|
|
75
|
+
# Indexar por llaves de negocio para aprovechar alineación de Pandas
|
|
76
|
+
df_exist_idx = df_active.set_index(self.business_keys)
|
|
77
|
+
df_new_idx = df_new.set_index(self.business_keys)
|
|
78
|
+
|
|
79
|
+
# Identificar segmentos por llaves utilizando operaciones de conjuntos
|
|
80
|
+
new_keys = df_new_idx.index.difference(df_exist_idx.index)
|
|
81
|
+
common_keys = df_new_idx.index.intersection(df_exist_idx.index)
|
|
82
|
+
preserved_keys = df_exist_idx.index.difference(df_new_idx.index)
|
|
83
|
+
|
|
84
|
+
# Separar registros comunes entre cambiados y no cambiados
|
|
85
|
+
changed_keys, unchanged_keys = self._split_common_keys(
|
|
86
|
+
df_new_idx, df_exist_idx, common_keys
|
|
87
|
+
)
|
|
88
|
+
|
|
89
|
+
# Construir cada segmento del SCD2
|
|
90
|
+
df_new_only = self._get_new_records(df_new_idx, new_keys)
|
|
91
|
+
df_changed = self._get_changed_records(df_new_idx, changed_keys)
|
|
92
|
+
df_expired = self._get_expired_records(
|
|
93
|
+
df_new_idx, df_exist_idx, changed_keys
|
|
94
|
+
)
|
|
95
|
+
df_unchanged = self._get_unchanged_records(
|
|
96
|
+
df_exist_idx, unchanged_keys
|
|
97
|
+
)
|
|
98
|
+
df_preserved = self._get_preserved_records(
|
|
99
|
+
df_exist_idx, preserved_keys
|
|
100
|
+
)
|
|
101
|
+
|
|
102
|
+
# Consolidar y ordenar columnas segĆŗn el esquema original
|
|
103
|
+
target_cols = df_existing.columns.tolist()
|
|
104
|
+
return pd.concat(
|
|
105
|
+
[df_new_only, df_changed, df_expired, df_unchanged, df_preserved],
|
|
106
|
+
ignore_index=True,
|
|
107
|
+
)[target_cols]
|
|
108
|
+
|
|
109
|
+
def _split_common_keys(
|
|
110
|
+
self,
|
|
111
|
+
df_new_idx: pd.DataFrame,
|
|
112
|
+
df_exist_idx: pd.DataFrame,
|
|
113
|
+
common_keys: pd.Index,
|
|
114
|
+
) -> tuple:
|
|
115
|
+
"""Splits common keys into changed and unchanged categories.
|
|
116
|
+
|
|
117
|
+
Args:
|
|
118
|
+
df_new_idx(pd.DataFrame): New data indexed by business keys.
|
|
119
|
+
df_exist_idx(pd.DataFrame): Active existing data indexed by keys.
|
|
120
|
+
common_keys(pd.Index): Intersection of business keys.
|
|
121
|
+
|
|
122
|
+
Returns:
|
|
123
|
+
tuple: A tuple containing (changed_keys, unchanged_keys).
|
|
124
|
+
"""
|
|
125
|
+
df_new_common = df_new_idx.loc[common_keys]
|
|
126
|
+
df_exist_common = df_exist_idx.loc[common_keys]
|
|
127
|
+
|
|
128
|
+
change_mask = pd.Series(False, index=common_keys)
|
|
129
|
+
for col in self.compare_columns:
|
|
130
|
+
change_mask |= df_new_common[col] != df_exist_common[col]
|
|
131
|
+
|
|
132
|
+
return common_keys[change_mask], common_keys[~change_mask]
|
|
133
|
+
|
|
134
|
+
def _get_new_records(
|
|
135
|
+
self, df_new_idx: pd.DataFrame, new_keys: pd.Index
|
|
136
|
+
) -> pd.DataFrame:
|
|
137
|
+
"""Extracts and formats brand new records.
|
|
138
|
+
|
|
139
|
+
Args:
|
|
140
|
+
df_new_idx(pd.DataFrame): New data indexed by business keys.
|
|
141
|
+
new_keys(pd.Index): Keys that only exist in the new batch.
|
|
142
|
+
|
|
143
|
+
Returns:
|
|
144
|
+
pd.DataFrame: Formatted dataframe for new insertions.
|
|
145
|
+
"""
|
|
146
|
+
df_new_only = df_new_idx.loc[new_keys].reset_index()
|
|
147
|
+
df_new_only[self.effective_date_col] = df_new_only[
|
|
148
|
+
self.input_timestamp_col
|
|
149
|
+
]
|
|
150
|
+
df_new_only[self.end_date_col] = self.max_date
|
|
151
|
+
df_new_only[self.current_flag_col] = True
|
|
152
|
+
return df_new_only
|
|
153
|
+
|
|
154
|
+
def _get_changed_records(
|
|
155
|
+
self, df_new_idx: pd.DataFrame, changed_keys: pd.Index
|
|
156
|
+
) -> pd.DataFrame:
|
|
157
|
+
"""Extracts and formats new active versions of changed records.
|
|
158
|
+
|
|
159
|
+
Args:
|
|
160
|
+
df_new_idx(pd.DataFrame): New data indexed by business keys.
|
|
161
|
+
changed_keys(pd.Index): Keys with detected attribute modifications.
|
|
162
|
+
|
|
163
|
+
Returns:
|
|
164
|
+
pd.DataFrame: Formatted dataframe for updated active rows.
|
|
165
|
+
"""
|
|
166
|
+
df_changed = df_new_idx.loc[changed_keys].reset_index()
|
|
167
|
+
df_changed[self.effective_date_col] = df_changed[
|
|
168
|
+
self.input_timestamp_col
|
|
169
|
+
]
|
|
170
|
+
df_changed[self.end_date_col] = self.max_date
|
|
171
|
+
df_changed[self.current_flag_col] = True
|
|
172
|
+
return df_changed
|
|
173
|
+
|
|
174
|
+
def _get_expired_records(
|
|
175
|
+
self,
|
|
176
|
+
df_new_idx: pd.DataFrame,
|
|
177
|
+
df_exist_idx: pd.DataFrame,
|
|
178
|
+
changed_keys: pd.Index,
|
|
179
|
+
) -> pd.DataFrame:
|
|
180
|
+
"""Closes historical records by updating end date and flag.
|
|
181
|
+
|
|
182
|
+
Args:
|
|
183
|
+
df_new_idx(pd.DataFrame): New data indexed by business keys.
|
|
184
|
+
df_exist_idx(pd.DataFrame): Active existing data indexed by keys.
|
|
185
|
+
changed_keys(pd.Index): Keys with detected attribute modifications.
|
|
186
|
+
|
|
187
|
+
Returns:
|
|
188
|
+
pd.DataFrame: Formatted dataframe for expired historical rows.
|
|
189
|
+
"""
|
|
190
|
+
df_expired = df_exist_idx.loc[changed_keys].reset_index()
|
|
191
|
+
closing_timestamps = df_new_idx.loc[
|
|
192
|
+
changed_keys, self.input_timestamp_col
|
|
193
|
+
].values
|
|
194
|
+
|
|
195
|
+
df_expired[self.end_date_col] = closing_timestamps
|
|
196
|
+
df_expired[self.current_flag_col] = False
|
|
197
|
+
return df_expired
|
|
198
|
+
|
|
199
|
+
def _get_unchanged_records(
|
|
200
|
+
self, df_exist_idx: pd.DataFrame, unchanged_keys: pd.Index
|
|
201
|
+
) -> pd.DataFrame:
|
|
202
|
+
"""Extracts active records that remain identical.
|
|
203
|
+
|
|
204
|
+
Args:
|
|
205
|
+
df_exist_idx(pd.DataFrame): Active existing data indexed by keys.
|
|
206
|
+
unchanged_keys(pd.Index): Keys with no attribute modifications.
|
|
207
|
+
|
|
208
|
+
Returns:
|
|
209
|
+
pd.DataFrame: Existing active rows without modifications.
|
|
210
|
+
"""
|
|
211
|
+
return df_exist_idx.loc[unchanged_keys].reset_index()
|
|
212
|
+
|
|
213
|
+
def _get_preserved_records(
|
|
214
|
+
self, df_exist_idx: pd.DataFrame, preserved_keys: pd.Index
|
|
215
|
+
) -> pd.DataFrame:
|
|
216
|
+
"""Extracts active records completely missing from the incoming batch.
|
|
217
|
+
|
|
218
|
+
Args:
|
|
219
|
+
df_exist_idx(pd.DataFrame): Active existing data indexed by keys.
|
|
220
|
+
preserved_keys(pd.Index): Keys missing from the new batch.
|
|
221
|
+
|
|
222
|
+
Returns:
|
|
223
|
+
pd.DataFrame: Existing active rows that must be preserved.
|
|
224
|
+
"""
|
|
225
|
+
return df_exist_idx.loc[preserved_keys].reset_index()
|
|
File without changes
|
|
@@ -0,0 +1,283 @@
|
|
|
1
|
+
"""
|
|
2
|
+
This module implements Slowly Changing Dimension Type 2 (SCD2) logic using
|
|
3
|
+
PySpark.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from typing import List
|
|
7
|
+
|
|
8
|
+
from pyspark.sql import Column, DataFrame
|
|
9
|
+
from pyspark.sql import functions as F
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class SparkSCD2Processor:
    """Handles Slowly Changing Dimension Type 2 (SCD2) logic using PySpark.

    This class encapsulates configuration parameters such as keys and metadata
    columns, allowing clean and repeatable executions of SCD2 updates.

    Change detection is null-safe: two NULL values in a compared column are
    treated as equal, and NULL against a non-NULL value is treated as a
    change.
    """

    def __init__(
        self,
        business_keys: List[str],
        compare_columns: List[str],
        effective_date_col: str = "effective_date",
        end_date_col: str = "end_date",
        current_flag_col: str = "is_current",
        input_timestamp_col: str = "input_timestamp",
        max_date: str = "9999-12-31",
    ):
        """Initializes the SparkSCD2Processor with configuration parameters.

        Args:
            business_keys(List[str]): List of columns that uniquely identify
                a record.
            compare_columns(List[str]): List of columns used to detect
                changes in the data.
            effective_date_col(str): Name of the effective start date column.
                Defaults to "effective_date".
            end_date_col(str): Name of the effective end date column.
                Defaults to "end_date".
            current_flag_col(str): Name of the boolean flag column
                indicating the active record. Defaults to "is_current".
            input_timestamp_col(str): Name of the column containing the
                ingestion/mutation timestamp. Defaults to "input_timestamp".
            max_date(str): The maximum date string used to populate active
                records. Defaults to "9999-12-31".

        Returns:
            None: This method initializes the class instance.
        """
        self.business_keys = business_keys
        self.compare_columns = compare_columns
        self.effective_date_col = effective_date_col
        self.end_date_col = end_date_col
        self.current_flag_col = current_flag_col
        self.input_timestamp_col = input_timestamp_col
        self.max_date = max_date

        # Aliases used to qualify columns of each side after the join.
        self._new_alias = "new"
        self._exist_alias = "exist"

    def process(
        self, spark_df_new: DataFrame, spark_df_existing: DataFrame
    ) -> DataFrame:
        """Orchestrates the SCD2 pipeline process and returns result.

        The result is the union of five segments: brand new records, new
        active versions of changed records, closed versions of changed
        records, preserved active records missing from the batch, and
        unchanged active records. Output columns follow the column list of
        ``spark_df_existing``.

        NOTE(review): every segment taken from the existing dimension is
        filtered on the current flag, so existing rows whose flag is false
        are not part of the result. Confirm that callers re-attach closed
        history before overwriting the dimension with this output.

        Args:
            spark_df_new(DataFrame): Incoming batch of data.
            spark_df_existing(DataFrame): Current state of dimension.

        Returns:
            DataFrame: Unified DataFrame with all processed records.

        Raises:
            ValueError: If ``compare_columns`` is empty.
        """
        target_columns = spark_df_existing.columns

        df_new_prepared = self._prepare_new_df(spark_df_new)
        df_joined = self._join_datasets(df_new_prepared, spark_df_existing)

        change_cond = self._build_change_condition()
        unchanged_cond = self._build_unchanged_condition()

        df_new_only = self._get_new_records(df_joined, target_columns)
        df_changed = self._get_changed_records(df_joined, change_cond, target_columns)
        df_unchanged = self._get_unchanged_records(
            df_joined, unchanged_cond, target_columns
        )
        df_expired = self._get_expired_records(df_joined, change_cond, target_columns)
        df_preserved = self._get_preserved_records(
            spark_df_new, spark_df_existing, target_columns
        )

        return (
            df_new_only.unionByName(df_changed)
            .unionByName(df_expired)
            .unionByName(df_preserved)
            .unionByName(df_unchanged)
        )

    def _aliased(self, alias: str, name: str) -> Column:
        """Returns a reference to column ``name`` qualified by ``alias``."""
        return F.col(f"{alias}.{name}")

    def _same_value(self, col: str) -> Column:
        """Builds a null-safe equality test of ``col`` between both sides."""
        return self._aliased(self._new_alias, col).eqNullSafe(
            self._aliased(self._exist_alias, col)
        )

    def _prepare_new_df(self, df: DataFrame) -> DataFrame:
        """Prepares the incoming new dataset by injecting SCD2 metadata.

        Args:
            df(DataFrame): The raw incoming PySpark DataFrame.

        Returns:
            DataFrame: PySpark DataFrame with SCD2 columns added: effective
                date copied from the input timestamp, end date set to the
                ``max_date`` literal and current flag set to True.
        """
        return (
            df.withColumn(self.effective_date_col, F.col(self.input_timestamp_col))
            .withColumn(self.end_date_col, F.lit(self.max_date))
            .withColumn(self.current_flag_col, F.lit(True))
        )

    def _join_datasets(self, df_new: DataFrame, df_existing: DataFrame) -> DataFrame:
        """Performs a left join between new data and historical dimension.

        Args:
            df_new(DataFrame): Prepared incoming DataFrame.
            df_existing(DataFrame): Historical dimension table.

        Returns:
            DataFrame: Joined PySpark DataFrame, with the new side aliased
                as "new" and the existing side aliased as "exist".
        """
        join_condition = [
            self._aliased(self._new_alias, col) == self._aliased(self._exist_alias, col)
            for col in self.business_keys
        ]
        return df_new.alias(self._new_alias).join(
            df_existing.alias(self._exist_alias),
            on=join_condition,
            how="left",
        )

    def _build_change_condition(self) -> Column:
        """Builds expression to identify mismatches across comparison columns.

        The comparison is null-safe. A plain ``!=`` evaluates to NULL when
        either side is NULL, which makes the row fail both the "changed" and
        the "unchanged" filters and silently drops it from the result.

        Args:

        Returns:
            Column: PySpark Column representing the logical OR condition.

        Raises:
            ValueError: If ``compare_columns`` is empty.
        """
        if not self.compare_columns:
            raise ValueError(
                "compare_columns cannot be empty for SCD2 processing."
            )

        mismatches = [~self._same_value(col) for col in self.compare_columns]

        # Seed with the first mismatch so the accumulator is always a Column.
        change_condition = mismatches[0]
        for mismatch in mismatches[1:]:
            change_condition |= mismatch

        return change_condition

    def _build_unchanged_condition(self) -> Column:
        """Builds expression to verify that columns are strictly identical.

        Uses the same null-safe equality as the change condition, so the two
        conditions are exact complements for every matched row.

        Args:

        Returns:
            Column: PySpark Column representing the logical AND condition.
        """
        unchanged_condition = F.lit(True)
        for col in self.compare_columns:
            unchanged_condition &= self._same_value(col)
        return unchanged_condition

    def _get_new_records(self, df_joined: DataFrame, columns: List[str]) -> DataFrame:
        """Extracts records whose business keys do not exist in dimension.

        Args:
            df_joined(DataFrame): The combined/joined PySpark DataFrame.
            columns(List[str]): List of output columns required.

        Returns:
            DataFrame: PySpark DataFrame containing brand new rows.
        """
        # After the left join, a NULL first business key on the existing side
        # is used as the marker for "no matching dimension row".
        exist_key = self._aliased(self._exist_alias, self.business_keys[0])
        select_exprs = [self._aliased(self._new_alias, c).alias(c) for c in columns]
        return df_joined.filter(exist_key.isNull()).select(select_exprs)

    def _get_changed_records(
        self, df_joined: DataFrame, change_cond: Column, columns: List[str]
    ) -> DataFrame:
        """Extracts newest version of records that suffered changes.

        Args:
            df_joined(DataFrame): The combined/joined PySpark DataFrame.
            change_cond(Column): PySpark Column condition for changes.
            columns(List[str]): List of output columns required.

        Returns:
            DataFrame: PySpark DataFrame containing active updated records.
        """
        is_current_cond = self._aliased(self._exist_alias, self.current_flag_col)
        select_exprs = [self._aliased(self._new_alias, c).alias(c) for c in columns]
        return (
            df_joined.filter(is_current_cond).filter(change_cond).select(select_exprs)
        )

    def _get_unchanged_records(
        self, df_joined: DataFrame, unchanged_cond: Column, columns: List[str]
    ) -> DataFrame:
        """Extracts existing active records that have no changes.

        Args:
            df_joined(DataFrame): The combined/joined PySpark DataFrame.
            unchanged_cond(Column): PySpark Column condition for identity.
            columns(List[str]): List of output columns required.

        Returns:
            DataFrame: PySpark DataFrame containing untouched active rows.
        """
        is_current_cond = self._aliased(self._exist_alias, self.current_flag_col)
        select_exprs = [self._aliased(self._exist_alias, c).alias(c) for c in columns]
        return (
            df_joined.filter(is_current_cond)
            .filter(unchanged_cond)
            .select(select_exprs)
        )

    def _get_expired_records(
        self, df_joined: DataFrame, change_cond: Column, columns: List[str]
    ) -> DataFrame:
        """Transforms existing active rows into historically closed versions.

        Args:
            df_joined(DataFrame): The combined/joined PySpark DataFrame.
            change_cond(Column): PySpark Column condition for changes.
            columns(List[str]): List of output columns required.

        Returns:
            DataFrame: PySpark DataFrame containing expired records: existing
                values, with the end date taken from the incoming input
                timestamp and the current flag set to False.
        """
        is_current_cond = self._aliased(self._exist_alias, self.current_flag_col)

        select_exprs = []
        for c in columns:
            if c == self.end_date_col:
                # The old version ends when the new one arrives.
                expr = self._aliased(self._new_alias, self.input_timestamp_col)
            elif c == self.current_flag_col:
                expr = F.lit(False)
            else:
                expr = self._aliased(self._exist_alias, c)
            select_exprs.append(expr.alias(c))

        return (
            df_joined.filter(is_current_cond).filter(change_cond).select(select_exprs)
        )

    def _get_preserved_records(
        self, df_new: DataFrame, df_existing: DataFrame, columns: List[str]
    ) -> DataFrame:
        """Preserves active records missing from the new batch.

        Args:
            df_new(DataFrame): The raw incoming PySpark DataFrame.
            df_existing(DataFrame): The existing dimension table.
            columns(List[str]): List of output columns required.

        Returns:
            DataFrame: PySpark DataFrame containing unaffected rows.
        """
        is_current_cond = self._aliased(self._exist_alias, self.current_flag_col)
        select_exprs = [self._aliased(self._exist_alias, c).alias(c) for c in columns]
        return (
            df_existing.alias(self._exist_alias)
            .join(
                df_new.alias(self._new_alias),
                on=self.business_keys,
                how="left_anti",
            )
            .filter(is_current_cond)
            .select(select_exprs)
        )
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Tu Nombre Completo
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: data-engineering-exp
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A minimalist, agnostic Python framework to standardize data engineering pipelines.
|
|
5
|
+
License: MIT
|
|
6
|
+
Keywords: data-engineering,pyspark,pandas,data-catalog
|
|
7
|
+
Author: idperez720
|
|
8
|
+
Author-email: ivandavidperez4@gmail.com
|
|
9
|
+
Requires-Python: >=3.11,<4.0.0
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Operating System :: OS Independent
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
16
|
+
Requires-Dist: click (>=8.4.1,<9.0.0)
|
|
17
|
+
Requires-Dist: pandas (>=3.0.3,<4.0.0)
|
|
18
|
+
Requires-Dist: pyspark (>=4.1.2,<5.0.0)
|
|
19
|
+
Requires-Dist: pyyaml (>=6.0.3,<7.0.0)
|
|
20
|
+
Project-URL: Repository, https://github.com/idperez720/data-engineering-exp
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
|
|
23
|
+
# Data Engineering Experience 🚀
|
|
24
|
+
|
|
25
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
26
|
+
[Python 3.11+](https://www.python.org)
|
|
27
|
+
|
|
28
|
+
**dex** (*Data Engineering Experience*) is a minimalist, agnostic Python framework designed to streamline and standardize data engineering pipelines. By embracing **Convention over Configuration**, `dex` eliminates environment friction, absolute path hardcoding, and complex PySpark session management.
|
|
29
|
+
|
|
30
|
+
---
|
|
31
|
+
|
|
32
|
+
## ✨ Key Features
|
|
33
|
+
|
|
34
|
+
* **Zero-Config File Discovery:** Automatic tree-walking directory resolution anchors your data catalog using your local `pyproject.toml` file.
|
|
35
|
+
* **Decentralized Catalog:** Declare your metadata layouts inside modular, self-contained mini-YAML files.
|
|
36
|
+
* **Elastic Processing Runtimes:** Switch dynamically between **Pandas** and **PySpark** execution engines using exactly the same unified interface.
|
|
37
|
+
* **Interactive CLI Scaffolding:** Spin up a new production-ready data directory structure instantly with `dex init`.
|
|
38
|
+
|
|
39
|
+
---
|
|
40
|
+
|
|
41
|
+
## 📦 Installation
|
|
42
|
+
|
|
43
|
+
*(Once published to PyPI)*
|
|
44
|
+
```bash
|
|
45
|
+
pip install data-engineering-exp
|
|
46
|
+
```
|
|
47
|
+
Or install it directly from the source repository using Poetry:
|
|
48
|
+
```bash
|
|
49
|
+
poetry add git+https://github.com/idperez720/data-engineering-exp.git
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
---
|
|
53
|
+
|
|
54
|
+
## 🚀 Quick Start
|
|
55
|
+
|
|
56
|
+
### 1. Initialize your workspace
|
|
57
|
+
|
|
58
|
+
Navigate to an empty directory and let the interactive wizard scaffold the workspace conventions:
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
dex init
|
|
62
|
+
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
### 2. Declare a dataset
|
|
66
|
+
|
|
67
|
+
Add a specification block inside `conf/catalog/sample_dataset.yaml`:
|
|
68
|
+
|
|
69
|
+
```yaml
|
|
70
|
+
customers:
|
|
71
|
+
description: "Main production customer data"
|
|
72
|
+
format: "csv"
|
|
73
|
+
engine: "pandas"
|
|
74
|
+
storage_path: "data/sample_table.csv"
|
|
75
|
+
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
### 3. Load data anywhere
|
|
79
|
+
|
|
80
|
+
Create a Python script or open a Jupyter Notebook inside `src/notebooks/` and fetch your data instantly:
|
|
81
|
+
|
|
82
|
+
```python
|
|
83
|
+
from data_engineering_exp.core.io import DataLoader
|
|
84
|
+
|
|
85
|
+
# Autodiscovers your project root boundaries and settings
|
|
86
|
+
loader = DataLoader()
|
|
87
|
+
|
|
88
|
+
# Loads the dataset securely as a Pandas DataFrame
|
|
89
|
+
df = loader.load("customers")
|
|
90
|
+
df.head()
|
|
91
|
+
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
---
|
|
95
|
+
|
|
96
|
+
## 📖 Complete Documentation
|
|
97
|
+
|
|
98
|
+
For comprehensive guides, testing architecture deep-dives, and complete API references, visit our documentation site:
|
|
99
|
+
👉 **[http://127.0.0.1:8000/](http://127.0.0.1:8000/)** *(local preview address; replace with your deployed docs URL, e.g., GitHub Pages)*
|
|
100
|
+
|
|
101
|
+
---
|
|
102
|
+
|
|
103
|
+
## ⚖️ License
|
|
104
|
+
|
|
105
|
+
Distributed under the **MIT License**. Any modification or distribution (including forks) must include the original copyright notice and liability waiver. See `LICENSE` for more information.
|
|
106
|
+
|
|
107
|
+
```
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
data_engineering_exp/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
+
data_engineering_exp/cli.py,sha256=KNQ8Ei4_wU-MTEH_pca3cOhwQJ0rV0wAZ3FwHtWuht8,1783
|
|
3
|
+
data_engineering_exp/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
4
|
+
data_engineering_exp/core/catalog.py,sha256=wu4QnsHus9QJi6dhQR_Eu4-elxAY-dTEor2goZpuE-U,7226
|
|
5
|
+
data_engineering_exp/core/deduplication.py,sha256=aLJrJtVQG56wI39ImSKvsIaq1U36hH7_-t4CjB3fwGw,6472
|
|
6
|
+
data_engineering_exp/core/initializer.py,sha256=Q0Kx7U5HxNu19pQYGvxxfy-1TVeFpHF4jMXKfji2i9c,3484
|
|
7
|
+
data_engineering_exp/core/io.py,sha256=VIzAEP5iQduUJ7OWV2K82iAdelpkPT9lk7NqU1BuRXA,4825
|
|
8
|
+
data_engineering_exp/pandas_core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
9
|
+
data_engineering_exp/pandas_core/scd2.py,sha256=KGeRYVv2ClxEoO9WUEVfF5Bdll3l-clJmIqYlT5EbZQ,8668
|
|
10
|
+
data_engineering_exp/spark_core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
11
|
+
data_engineering_exp/spark_core/scd2.py,sha256=Zza0ePChxq9TV74OK5gr8odhUF7sDiX_lmoAS2qwyV4,10841
|
|
12
|
+
data_engineering_exp-0.1.0.dist-info/LICENSE,sha256=AVQrXW_hCAJSW_1PiMnJmic2hJ5aT9ypRcQEqIY4sdw,1074
|
|
13
|
+
data_engineering_exp-0.1.0.dist-info/METADATA,sha256=V3OeZlQeGXkdJdiuHzAgmrTQys6BJ0o7lgM7SG8W2a8,3607
|
|
14
|
+
data_engineering_exp-0.1.0.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
|
|
15
|
+
data_engineering_exp-0.1.0.dist-info/entry_points.txt,sha256=xHkLyRWvtD12WAnIZ4p0XnzRvhvr3Mt0wj5x8hcJ07A,60
|
|
16
|
+
data_engineering_exp-0.1.0.dist-info/RECORD,,
|