data-engineering-exp 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
File without changes
@@ -0,0 +1,63 @@
1
+ """Command Line Interface (CLI) entry point for dex using Click."""
2
+
3
+ import os
4
+
5
+ import click
6
+
7
+ from data_engineering_exp.core.initializer import ProjectInitializer
8
+
9
+
10
@click.group()
def entry_point() -> None:
    """dex - Data Engineering Experience CLI utilities.
    \f

    Root Click group of the ``dex`` executable. It performs no work itself;
    sub-commands attach to it through ``@entry_point.command()``.

    Everything after the form-feed marker above is developer documentation:
    Click cuts the ``--help`` text at that marker, so these sections are not
    shown to end users.

    Returns:
        None: The group body is intentionally empty.
    """
20
+
21
+
22
@entry_point.command()
@click.option(
    "--path",
    type=click.Path(file_okay=False, dir_okay=True),
    default=".",
    help="Root directory where scaffolding will be built. Defaults to '.'",
)
def init(path: str) -> None:
    """Scaffold a new data engineering project structure interactively.
    \f

    Prompts for the project metadata (name, version, description, author)
    and hands the answers to ``ProjectInitializer.init_project``.

    Everything after the form-feed marker above is developer documentation:
    Click cuts the ``--help`` text at that marker.

    Args:
        path(str): Target directory in which the project layout is created.

    Returns:
        None: Side-effect creating folders and configuration files.
    """
    click.echo("🚀 Welcome to the dex project initialization wizard!\n")

    # The suggested project name is the name of the target directory. After
    # ``abspath`` the basename is only empty for a filesystem root, in which
    # case a generic placeholder is offered instead.
    default_name = os.path.basename(os.path.abspath(path)) or "my-dex-project"

    # Sequential metadata prompts; pressing Enter accepts the default.
    name = click.prompt("Project name", default=default_name, type=str)
    version = click.prompt("Version", default="0.1.0", type=str)
    description = click.prompt(
        "Description",
        default="Data engineering project scaffolded by dex",
        type=str,
    )
    author = click.prompt("Author", default="Anonymous", type=str)

    initializer = ProjectInitializer(base_path=path)
    initializer.init_project(
        name=name,
        version=version,
        description=description,
        author=author,
    )

    click.echo(f"\n✨ Project successfully initialized at '{path}'!")
File without changes
@@ -0,0 +1,197 @@
1
+ """This module implements decentralized data catalog engines for dex."""
2
+
3
+ import os
4
+ from typing import Any, Dict, List, Optional, cast
5
+
6
+ import yaml
7
+
8
+
9
class DataCatalog:
    """Parses and consolidates declarative dataset definitions from YAML sources.

    A catalog source is either a single ``.yml``/``.yaml`` file or a directory
    that is scanned recursively, so definitions may be split into several small
    files or kept in one document. Every top-level key of a YAML mapping is
    stored as one dataset entry; a key defined in more than one file is
    overwritten by the file that is loaded last.
    """

    # File suffixes recognised as catalog documents.
    _YAML_SUFFIXES = (".yml", ".yaml")

    def __init__(self, catalog_path: Optional[str] = None) -> None:
        """Initializes the catalog from an explicit or a discovered location.

        When no path is given, parent directories of the current working
        directory are searched for ``pyproject.toml`` and ``conf/catalog``
        below that directory is used.

        Args:
            catalog_path(Optional[str]): Directory or YAML file containing the
                catalog definitions. Defaults to None (auto-discovery).

        Returns:
            None: Initializes the class instance.

        Raises:
            FileNotFoundError: If the catalog location does not exist, or if
                auto-discovery finds no ``pyproject.toml``.
        """
        self.project_root: str = ""

        if catalog_path is None:
            # Discovery also records ``project_root`` as a side effect.
            catalog_path = self._discover_catalog_path()
        else:
            # An explicit path is assumed to follow the standard layout
            # ``<root>/conf/catalog[/<file>]``: the root is two levels above a
            # directory and three levels above a file.
            root = os.path.abspath(catalog_path)
            levels_up = 3 if os.path.isfile(root) else 2
            for _ in range(levels_up):
                root = os.path.dirname(root)
            self.project_root = root

        if not os.path.exists(catalog_path):
            raise FileNotFoundError(f"Catalog source path not found at: {catalog_path}")

        self._datasets: Dict[str, Dict[str, Any]] = {}
        self._load_catalog_sources(catalog_path)

    @property
    def dataset_names(self) -> List[str]:
        """Exposes the identifiers of all datasets present in the catalog.

        Returns:
            List[str]: Dataset names in the order they were loaded.
        """
        return list(self._datasets)

    def get_dataset_metadata(self, dataset_name: str) -> Dict[str, Any]:
        """Retrieves the metadata of a dataset without its column specification.

        Args:
            dataset_name(str): Unique identifier string of the dataset.

        Returns:
            Dict[str, Any]: Shallow copy of the dataset entry with the
                ``columns`` key removed.

        Raises:
            KeyError: If the target dataset name is missing from the catalog.
        """
        if dataset_name not in self._datasets:
            raise KeyError(f"Dataset '{dataset_name}' missing from catalog.")

        # Copy first so that dropping ``columns`` never mutates the catalog.
        meta = self._datasets[dataset_name].copy()
        meta.pop("columns", None)
        return meta

    def get_column_names(self, dataset_name: str) -> List[str]:
        """Extracts the expected list of column names for a target dataset.

        Args:
            dataset_name(str): Unique identifier string of the dataset.

        Returns:
            List[str]: Column names in catalog order; empty when the dataset
                declares no columns.

        Raises:
            KeyError: If the target dataset name is missing from the catalog.
        """
        if dataset_name not in self._datasets:
            raise KeyError(f"Dataset '{dataset_name}' missing from catalog.")

        # ``columns:`` with no value parses to None in YAML; treat it like an
        # absent key instead of failing while iterating None.
        columns_spec = self._datasets[dataset_name].get("columns") or []
        return [col["name"] for col in columns_spec]

    def validate_schema_presence(self, df: Any, dataset_name: str) -> bool:
        """Validates that all catalog columns exist in the dataframe.

        Extra dataframe columns that the catalog does not mention are accepted.

        Args:
            df(Any): Input Pandas or PySpark DataFrame object.
            dataset_name(str): Catalog identifier string of the dataset.

        Returns:
            bool: True if all expected columns are present.

        Raises:
            KeyError: If the dataset name is missing from the catalog.
            TypeError: If the input dataframe type is not supported.
            ValueError: If catalog columns are missing from the dataframe.
        """
        expected_cols = set(self.get_column_names(dataset_name))

        # The framework is recognised by class and module name so that neither
        # pandas nor pyspark has to be imported by this module.
        df_type = type(df).__name__
        df_module = type(df).__module__
        if df_type == "DataFrame" and "pandas" in df_module:
            actual_cols = set(df.columns.tolist())
        elif df_type == "DataFrame" and "pyspark" in df_module:
            actual_cols = set(df.columns)
        else:
            raise TypeError("Unsupported DataFrame type for validation.")

        missing_cols = expected_cols - actual_cols
        if missing_cols:
            # Sorted so the message is reproducible between runs.
            raise ValueError(
                f"Schema mismatch for '{dataset_name}'. "
                f"Missing expected catalog columns: {sorted(missing_cols, key=str)}"
            )

        return True

    def _discover_catalog_path(self) -> str:
        """Walks up from the working directory to locate ``pyproject.toml``.

        The directory holding the file becomes ``project_root`` and the
        standard ``conf/catalog`` sub-path is appended to it.

        Returns:
            str: Path of the catalog directory below the discovered root.

        Raises:
            FileNotFoundError: If no ``pyproject.toml`` exists in the working
                directory or any of its parents.
        """
        current_dir = os.getcwd()

        while True:
            if os.path.exists(os.path.join(current_dir, "pyproject.toml")):
                self.project_root = current_dir
                return os.path.join(current_dir, "conf", "catalog")

            parent_dir = os.path.dirname(current_dir)
            if parent_dir == current_dir:
                # dirname() is a fixed point only at the filesystem root.
                break
            current_dir = parent_dir

        raise FileNotFoundError(
            "Configuration file (pyproject.toml) could not be located. "
            "Please run 'dex init' to establish a valid project layout."
        )

    def _load_catalog_sources(self, path: str) -> None:
        """Parses a single YAML file or every YAML file below a directory.

        Directories and files are visited in sorted order so that, when two
        files define the same dataset key, the winning definition does not
        depend on the platform's directory listing order.

        Args:
            path(str): Catalog file or catalog directory.

        Returns:
            None: Populates the internal dataset dictionary.
        """
        if os.path.isfile(path):
            # A file with another suffix is silently ignored.
            if path.endswith(self._YAML_SUFFIXES):
                self._parse_file(path)
            return

        for root, dirs, files in os.walk(path):
            dirs.sort()  # in-place: fixes the order os.walk descends in
            for file_name in sorted(files):
                if file_name.endswith(self._YAML_SUFFIXES):
                    self._parse_file(os.path.join(root, file_name))

    def _parse_file(self, file_path: str) -> None:
        """Parses one YAML catalog file and merges it into the catalog.

        Documents that are empty or whose top level is not a mapping are
        ignored.

        Args:
            file_path(str): Location of the YAML file.

        Returns:
            None: Updates the internal dataset dictionary.
        """
        with open(file_path, "r", encoding="utf-8") as stream:
            content = yaml.safe_load(stream)

        if content and isinstance(content, dict):
            typed_content = cast(Dict[str, Dict[str, Any]], content)
            self._datasets.update(typed_content)
@@ -0,0 +1,192 @@
1
+ """This module implements versatile data deduplication utilities for dex."""
2
+
3
+ from typing import Any, List, Optional, Union
4
+
5
+ # Handle optional dependencies at the top of the file
6
+ try:
7
+ import pyspark.sql.functions as F
8
+ from pyspark.sql import Window
9
+
10
+ HAS_SPARK = True
11
+ except ImportError:
12
+ Window: Any = None
13
+ F: Any = None
14
+ HAS_SPARK = False
15
+
16
+
17
class Deduplicator:
    """Utility class providing multiple strategies for data deduplication.

    Every public method accepts either a Pandas or a PySpark DataFrame and
    dispatches on the detected framework. ``latest``, ``first`` and
    ``by_order`` return one complete input row per key group; ``combined``
    deliberately stitches a row together from the first non-null value of
    each column.
    """

    def __init__(self) -> None:
        """Initializes the Deduplicator instance.

        Returns:
            None: The class keeps no state.
        """

    def latest(self, df: Any, keys: List[str], order_by_col: str) -> Any:
        """Extracts the most recent record per business key group.

        Args:
            df(Any): Input Pandas or PySpark DataFrame.
            keys(List[str]): Columns that identify a business entity.
            order_by_col(str): Column used to determine chronological order.

        Returns:
            Any: DataFrame holding, for each key group, the row with the
                greatest ``order_by_col`` value.
        """
        if self._detect_framework(df) == "pandas":
            return self._pandas_top_row(df, keys, [order_by_col], False)

        return self._spark_top_row(df, keys, [F.col(order_by_col).desc()])

    def first(self, df: Any, keys: List[str], order_by_col: str) -> Any:
        """Extracts the earliest chronological record per business key group.

        Args:
            df(Any): Input Pandas or PySpark DataFrame.
            keys(List[str]): Columns that identify a business entity.
            order_by_col(str): Column used to determine chronological order.

        Returns:
            Any: DataFrame holding, for each key group, the row with the
                smallest ``order_by_col`` value.
        """
        if self._detect_framework(df) == "pandas":
            return self._pandas_top_row(df, keys, [order_by_col], True)

        return self._spark_top_row(df, keys, [F.col(order_by_col).asc()])

    def distinct(self, df: Any, subset: Optional[List[str]] = None) -> Any:
        """Removes strict duplicate rows from the DataFrame.

        Args:
            df(Any): Input Pandas or PySpark DataFrame.
            subset(Optional[List[str]]): Specific columns to consider when
                identifying duplicates. Defaults to None (all columns).

        Returns:
            Any: DataFrame with unique records only.
        """
        if self._detect_framework(df) == "pandas":
            return df.drop_duplicates(subset=subset).reset_index(drop=True)

        # Passed positionally: the ``subset=`` keyword is not accepted by
        # PySpark releases whose dropDuplicates takes ``*subset``.
        if subset:
            return df.dropDuplicates(subset)
        return df.dropDuplicates()

    def by_order(
        self,
        df: Any,
        keys: List[str],
        order_by_cols: List[str],
        ascending: Union[bool, List[bool]] = True,
    ) -> Any:
        """Deduplicates rows by ranking them on several sort columns.

        Args:
            df(Any): Input Pandas or PySpark DataFrame.
            keys(List[str]): Columns that identify a business entity.
            order_by_cols(List[str]): Ordered list of columns to sort by.
            ascending(Union[bool, List[bool]]): One direction for all sort
                columns, or one direction per column. Defaults to True.

        Returns:
            Any: DataFrame holding the top-ranked row of each key group.

        Raises:
            ValueError: If ``ascending`` is a list whose length differs from
                ``order_by_cols``.
        """
        framework = self._detect_framework(df)

        if isinstance(ascending, bool):
            asc_list = [ascending] * len(order_by_cols)
        else:
            asc_list = list(ascending)
            # zip() below would silently ignore surplus or missing entries.
            if len(asc_list) != len(order_by_cols):
                raise ValueError(
                    f"Length of ascending ({len(asc_list)}) must match "
                    f"length of order_by_cols ({len(order_by_cols)})."
                )

        if framework == "pandas":
            return self._pandas_top_row(df, keys, list(order_by_cols), asc_list)

        order_exprs = [
            F.col(name).asc() if is_asc else F.col(name).desc()
            for name, is_asc in zip(order_by_cols, asc_list)
        ]
        return self._spark_top_row(df, keys, order_exprs)

    def combined(self, df: Any, keys: List[str]) -> Any:
        """Stitches rows together by compacting nulls into a golden record.

        For every non-key column the first non-null value found in the group
        is kept, so the resulting row may combine values of several rows.

        Args:
            df(Any): Input Pandas or PySpark DataFrame.
            keys(List[str]): Columns that identify a business entity.

        Returns:
            Any: Consolidated DataFrame with one row per key group.
        """
        if self._detect_framework(df) == "pandas":
            # dropna=False keeps groups whose key is null, as Spark's groupBy
            # does; GroupBy.first() skips nulls per column by default.
            return df.groupby(keys, dropna=False).first().reset_index()

        agg_cols = [
            F.first(c, ignorenulls=True).alias(c) for c in df.columns if c not in keys
        ]
        return df.groupBy(keys).agg(*agg_cols)

    @staticmethod
    def _pandas_top_row(
        df: Any,
        keys: List[str],
        order_by: List[str],
        ascending: Union[bool, List[bool]],
    ) -> Any:
        """Returns the first complete row of each key group after sorting.

        ``groupby().first()/.last()`` are not used because they pick the first
        or last non-null value of every column independently and can therefore
        return a row that never existed in the input.

        Args:
            df(Any): Input Pandas DataFrame.
            keys(List[str]): Columns that identify a business entity.
            order_by(List[str]): Columns to sort by before picking.
            ascending(Union[bool, List[bool]]): Sort direction(s).

        Returns:
            Any: One row per key group, key columns first, sorted by the key
                columns, with a fresh integer index.
        """
        key_cols = [keys] if isinstance(keys, str) else list(keys)

        # Stable sort: rows tying on the sort columns keep their input order.
        # Null sort values go last, so they are only picked when a group has
        # no other row.
        ranked = df.sort_values(by=order_by, ascending=ascending, kind="stable")
        picked = ranked.drop_duplicates(subset=key_cols, keep="first")

        # Present the result like a grouped frame: ordered by key, keys first.
        picked = picked.sort_values(by=key_cols, kind="stable").reset_index(drop=True)
        other_cols = [c for c in picked.columns if c not in key_cols]
        return picked[key_cols + other_cols]

    @staticmethod
    def _spark_top_row(df: Any, keys: List[str], order_exprs: List[Any]) -> Any:
        """Keeps the row ranked first inside each key partition.

        Args:
            df(Any): Input PySpark DataFrame.
            keys(List[str]): Columns the window is partitioned by.
            order_exprs(List[Any]): Column ordering expressions of the window.

        Returns:
            Any: PySpark DataFrame with one row per key partition.
        """
        window_spec = Window.partitionBy(keys).orderBy(*order_exprs)
        # "_row_num" is a temporary helper column and is removed again.
        return (
            df.withColumn("_row_num", F.row_number().over(window_spec))
            .filter(F.col("_row_num") == 1)
            .drop("_row_num")
        )

    def _detect_framework(self, df: Any) -> str:
        """Identifies the dataframe framework from the object's class.

        Args:
            df(Any): Input dataframe object.

        Returns:
            str: String identifier ("pandas" or "spark").

        Raises:
            ImportError: If a PySpark DataFrame is passed but PySpark could
                not be imported by this module.
            TypeError: If the object is neither a Pandas nor a PySpark
                DataFrame.
        """
        df_type = type(df).__name__
        module_type = type(df).__module__

        if "pandas" in module_type and df_type == "DataFrame":
            return "pandas"

        if "pyspark" in module_type and df_type == "DataFrame":
            if not HAS_SPARK:
                raise ImportError("PySpark is required but could not be imported.")
            return "spark"

        raise TypeError(
            f"Unsupported type: {type(df)}. "
            "Only Pandas and PySpark DataFrames are supported."
        )
@@ -0,0 +1,93 @@
1
+ """This module implements the project structural initialization engine for dex."""
2
+
3
+ import os
4
+
5
+
6
class ProjectInitializer:
    """Handles scaffolding of directory structures for new data engineering setups.

    Creates the standard folders (``conf/catalog``, ``src/notebooks``,
    ``data``) plus a ``pyproject.toml``, a sample CSV and a sample catalog
    entry. Files that already exist are left untouched, so running the
    initializer inside an existing project never destroys user content.
    """

    def __init__(self, base_path: str = ".") -> None:
        """Initializes the ProjectInitializer with a target base path.

        Args:
            base_path(str): Root path where scaffolding will be built.
                Defaults to ".".

        Returns:
            None: Initializes the class instance.
        """
        self.base_path = base_path

    def init_project(
        self,
        name: str = "my-dex-project",
        version: str = "0.1.0",
        description: str = "Data engineering project scaffolded by dex",
        author: str = "Anonymous",
    ) -> None:
        """Scaffolds the required folders and baseline configuration templates.

        Args:
            name(str): Name of the data project. Defaults to
                "my-dex-project".
            version(str): Initial version string. Defaults to "0.1.0".
            description(str): Short purpose statement of the repository.
                Defaults to "Data engineering project scaffolded by dex".
            author(str): Name or alias of the creator. Defaults to
                "Anonymous".

        Returns:
            None: Side-effect function creating disk structures.

        Raises:
            OSError: If a directory or file cannot be created.
        """
        for parts in (("conf", "catalog"), ("src", "notebooks"), ("data",)):
            os.makedirs(os.path.join(self.base_path, *parts), exist_ok=True)

        # pyproject.toml is the anchor file of the project layout. User
        # supplied values are written as escaped TOML strings so that quotes
        # or backslashes in them cannot produce an invalid file.
        toml_content = (
            "[project]\n"
            f"name = {self._toml_string(name)}\n"
            f"version = {self._toml_string(version)}\n"
            f"description = {self._toml_string(description)}\n"
            "authors = [\n"
            f"    {{name = {self._toml_string(author)}}}\n"
            "]\n"
            'requires-python = ">=3.11,<4.0.0"\n'
        )
        self._write_if_absent(
            os.path.join(self.base_path, "pyproject.toml"), toml_content
        )

        # A small real CSV file the sample catalog entry can point to.
        self._write_if_absent(
            os.path.join(self.base_path, "data", "sample_table.csv"),
            "id,name\n1,Alice\n2,Bob\n",
        )

        # Sample catalog entry describing the CSV above; ``storage_path`` is
        # relative to the project root.
        sample_content = (
            "sample_table:\n"
            "  description: 'Boilerplate example dataset created by dex'\n"
            "  format: 'csv'\n"
            "  engine: 'pandas'\n"
            "  storage_path: 'data/sample_table.csv'\n"
            "  columns:\n"
            "    - name: 'id'\n"
            "      type: 'integer'\n"
            "    - name: 'name'\n"
            "      type: 'string'\n"
        )
        self._write_if_absent(
            os.path.join(self.base_path, "conf", "catalog", "sample_dataset.yaml"),
            sample_content,
        )

    @staticmethod
    def _toml_string(value: str) -> str:
        """Renders a value as a quoted TOML basic string.

        Args:
            value(str): Raw text to embed in a TOML document.

        Returns:
            str: The text in double quotes with backslashes, double quotes
                and control characters escaped.
        """
        escaped = []
        for char in str(value):
            if char in ('"', "\\"):
                escaped.append("\\" + char)
            elif ord(char) < 0x20 or ord(char) == 0x7F:
                # Control characters may not appear raw in a basic string.
                escaped.append(f"\\u{ord(char):04X}")
            else:
                escaped.append(char)
        return '"' + "".join(escaped) + '"'

    @staticmethod
    def _write_if_absent(file_path: str, content: str) -> bool:
        """Creates a text file unless a file already exists at that path.

        Args:
            file_path(str): Destination of the file.
            content(str): Text written to a newly created file.

        Returns:
            bool: True if the file was created, False if it already existed
                and was left unchanged.
        """
        try:
            # Mode "x" fails instead of truncating an existing file.
            with open(file_path, "x", encoding="utf-8") as handle:
                handle.write(content)
        except FileExistsError:
            return False
        return True
@@ -0,0 +1,134 @@
1
+ """This module implements metadata-driven data loading utilities for dex."""
2
+
3
+ import os
4
+ from typing import Any, Optional, cast
5
+
6
+ import pandas as pd
7
+
8
+ from data_engineering_exp.core.catalog import DataCatalog
9
+
10
+ try:
11
+ from pyspark.sql import SparkSession
12
+
13
+ HAS_SPARK = True
14
+ except ImportError:
15
+ SparkSession = cast(Any, None)
16
+ HAS_SPARK = False
17
+
18
+
19
class DataLoader:
    """Loads datasets according to the metadata stored in a DataCatalog.

    The catalog entry of a dataset names the execution engine (``pandas`` or
    ``spark``), the file format (``parquet`` or ``csv``) and the storage path;
    the loader routes the read to the matching backend.
    """

    def __init__(self, catalog: Optional[DataCatalog] = None) -> None:
        """Binds the loader to a catalog.

        Args:
            catalog(Optional[DataCatalog]): Catalog to read metadata from. If
                None, a ``DataCatalog()`` without arguments is created.

        Returns:
            None: Initializes the class instance.
        """
        if catalog is None:
            catalog = DataCatalog()
        self.catalog = catalog

    def load(self, dataset_name: str, spark: Optional[Any] = None) -> Any:
        """Loads a dataset from storage based on its catalog specification.

        Args:
            dataset_name(str): Unique identifier string of the dataset.
            spark(Optional[Any]): SparkSession to use for the ``spark``
                engine; when omitted the active session is looked up.

        Returns:
            Any: Loaded Pandas or PySpark DataFrame object.

        Raises:
            KeyError: If the dataset is unknown to the catalog or its
                metadata lacks 'engine', 'format' or 'storage_path'.
            ValueError: If an unsupported engine or format is configured.
            ImportError: If the spark engine is requested without PySpark.
        """
        spec = self.catalog.get_dataset_metadata(dataset_name)
        engine = spec.get("engine")
        data_format = spec.get("format")
        location = spec.get("storage_path")

        # All three settings are mandatory; empty values count as missing.
        if not (engine and data_format and location):
            raise KeyError(
                f"Dataset '{dataset_name}' metadata must contain "
                "'engine', 'format', and 'storage_path'."
            )

        # Relative storage paths are interpreted relative to the project root
        # recorded by the catalog.
        if not os.path.isabs(location):
            anchored = os.path.join(self.catalog.project_root, location)
            location = os.path.normpath(anchored)

        if engine == "pandas":
            return self._load_with_pandas(location, data_format)
        if engine == "spark":
            return self._load_with_spark(location, data_format, spark)
        raise ValueError(f"Unsupported execution engine: '{engine}'.")

    def _load_with_pandas(self, path: str, data_format: str) -> pd.DataFrame:
        """Reads a file with Pandas.

        Args:
            path(str): Location of the file to read.
            data_format(str): Format identifier, "parquet" or "csv".

        Returns:
            pd.DataFrame: Loaded Pandas DataFrame object.

        Raises:
            ValueError: If the format is not supported for Pandas.
        """
        if data_format == "csv":
            return pd.read_csv(path)
        if data_format == "parquet":
            return pd.read_parquet(path)
        raise ValueError(f"Unsupported Pandas format: '{data_format}'.")

    def _load_with_spark(
        self, path: str, data_format: str, spark: Optional[Any]
    ) -> Any:
        """Reads a file with PySpark.

        Args:
            path(str): Location of the file to read.
            data_format(str): Format identifier, "parquet" or "csv".
            spark(Optional[Any]): Explicit SparkSession supplied by the user.

        Returns:
            Any: Loaded PySpark DataFrame object.

        Raises:
            ImportError: If PySpark could not be imported.
            ValueError: If no SparkSession is available or the format is
                not supported for Spark.
        """
        if not HAS_SPARK:
            raise ImportError("PySpark is required but could not be imported.")

        # An explicitly passed session wins over the globally active one.
        session = spark if spark is not None else SparkSession.getActiveSession()
        if session is None:
            raise ValueError(
                "A valid SparkSession must be provided or active globally "
                "to load data using the 'spark' engine."
            )

        reader = session.read
        if data_format == "csv":
            # CSV files are read with a header row and inferred column types.
            return reader.csv(path, header=True, inferSchema=True)
        if data_format == "parquet":
            return reader.parquet(path)
        raise ValueError(f"Unsupported Spark format: '{data_format}'.")
File without changes
@@ -0,0 +1,225 @@
1
+ """This module implements Slowly Changing Dimension Type 2 (SCD2) logic using Pandas."""
2
+
3
+ from typing import List
4
+
5
+ import pandas as pd
6
+
7
+
8
class PandasSCD2Processor:
    """Handles Slowly Changing Dimension Type 2 (SCD2) logic using Pandas.

    The processor is configured once with the business keys, the columns that
    are compared to detect changes and the names of the SCD2 bookkeeping
    columns; ``process`` can then be applied to any number of batches.
    """

    def __init__(
        self,
        business_keys: List[str],
        compare_columns: List[str],
        effective_date_col: str = "effective_date",
        end_date_col: str = "end_date",
        current_flag_col: str = "is_current",
        input_timestamp_col: str = "input_timestamp",
        max_date: str = "9999-12-31",
    ) -> None:
        """Initializes the PandasSCD2Processor with configuration parameters.

        Args:
            business_keys(List[str]): Columns that uniquely identify a record.
            compare_columns(List[str]): Columns used to detect changes.
            effective_date_col(str): Name of the effective start date column.
                Defaults to "effective_date".
            end_date_col(str): Name of the effective end date column.
                Defaults to "end_date".
            current_flag_col(str): Name of the boolean column marking the
                active record. Defaults to "is_current".
            input_timestamp_col(str): Name of the incoming column holding the
                ingestion/mutation timestamp. Defaults to "input_timestamp".
            max_date(str): Value written as end date of active records.
                Defaults to "9999-12-31".

        Returns:
            None: This method initializes the class instance.
        """
        self.business_keys = business_keys
        self.compare_columns = compare_columns
        self.effective_date_col = effective_date_col
        self.end_date_col = end_date_col
        self.current_flag_col = current_flag_col
        self.input_timestamp_col = input_timestamp_col
        self.max_date = max_date

    def process(
        self, df_new: pd.DataFrame, df_existing: pd.DataFrame
    ) -> pd.DataFrame:
        """Applies an incoming batch to the dimension and returns its new state.

        The result contains, in this order: brand new records, new versions
        of changed records, the expired previous versions of changed records,
        unchanged active records, active records absent from the batch, and
        all previously expired history rows of ``df_existing``.

        Args:
            df_new(pd.DataFrame): Incoming batch of data.
            df_existing(pd.DataFrame): Current state of the dimension.

        Returns:
            pd.DataFrame: Full dimension after the batch, with the columns of
                ``df_existing`` in their original order.

        Raises:
            ValueError: If ``compare_columns`` is empty, or if the business
                keys are not unique in the batch or among active records.
        """
        if not self.compare_columns:
            raise ValueError(
                "compare_columns cannot be empty for SCD2 processing."
            )

        # Split the dimension into active rows (compared against the batch)
        # and already closed history rows (carried over untouched). The mask
        # is built explicitly so that object or nullable flag columns, and
        # missing flags, do not break boolean indexing.
        is_active = (
            df_existing[self.current_flag_col].eq(True).fillna(False).astype(bool)
        )
        df_active = df_existing[is_active]
        df_history = df_existing[~is_active]

        # Index both sides by the business keys to use index set operations.
        df_exist_idx = df_active.set_index(self.business_keys)
        df_new_idx = df_new.set_index(self.business_keys)
        self._require_unique_keys(df_new_idx, "incoming batch")
        self._require_unique_keys(df_exist_idx, "active dimension records")

        # Classify keys: only in the batch, on both sides, only in the dimension.
        new_keys = df_new_idx.index.difference(df_exist_idx.index)
        common_keys = df_new_idx.index.intersection(df_exist_idx.index)
        preserved_keys = df_exist_idx.index.difference(df_new_idx.index)

        # Keys present on both sides are either changed or unchanged.
        changed_keys, unchanged_keys = self._split_common_keys(
            df_new_idx, df_exist_idx, common_keys
        )

        # Build every segment of the SCD2 result.
        segments = [
            self._get_new_records(df_new_idx, new_keys),
            self._get_changed_records(df_new_idx, changed_keys),
            self._get_expired_records(df_new_idx, df_exist_idx, changed_keys),
            self._get_unchanged_records(df_exist_idx, unchanged_keys),
            self._get_preserved_records(df_exist_idx, preserved_keys),
            df_history,
        ]

        # Consolidate and restore the column order of the existing dimension.
        # Empty segments are skipped so they cannot influence result dtypes.
        target_cols = df_existing.columns.tolist()
        filled = [segment for segment in segments if not segment.empty]
        if not filled:
            return df_existing.iloc[0:0].reset_index(drop=True)
        return pd.concat(filled, ignore_index=True).reindex(columns=target_cols)

    def _require_unique_keys(self, df_idx: pd.DataFrame, label: str) -> None:
        """Rejects frames in which a business key occurs more than once.

        Args:
            df_idx(pd.DataFrame): Frame indexed by the business keys.
            label(str): Human readable name of the frame for the message.

        Returns:
            None: Returns silently when all keys are unique.

        Raises:
            ValueError: If the index contains duplicate keys.
        """
        if df_idx.index.has_duplicates:
            raise ValueError(
                f"Business keys {self.business_keys} are not unique in the "
                f"{label}."
            )

    def _split_common_keys(
        self,
        df_new_idx: pd.DataFrame,
        df_exist_idx: pd.DataFrame,
        common_keys: pd.Index,
    ) -> tuple:
        """Splits common keys into changed and unchanged categories.

        A key counts as changed when at least one compare column differs.
        Two missing values are treated as equal, so a null attribute alone
        never triggers a new version.

        Args:
            df_new_idx(pd.DataFrame): New data indexed by business keys.
            df_exist_idx(pd.DataFrame): Active existing data indexed by keys.
            common_keys(pd.Index): Intersection of business keys.

        Returns:
            tuple: A tuple containing (changed_keys, unchanged_keys).
        """
        # Both selections share the index ``common_keys`` in the same order,
        # which lets the column comparisons below align row by row.
        df_new_common = df_new_idx.loc[common_keys]
        df_exist_common = df_exist_idx.loc[common_keys]

        change_mask = pd.Series(False, index=common_keys)
        for col in self.compare_columns:
            incoming = df_new_common[col]
            current = df_exist_common[col]
            equal = (incoming == current).fillna(False).astype(bool)
            both_missing = incoming.isna() & current.isna()
            change_mask |= ~(equal | both_missing)

        changed = change_mask.to_numpy(dtype=bool)
        return common_keys[changed], common_keys[~changed]

    def _open_records(self, df_new_idx: pd.DataFrame, keys: pd.Index) -> pd.DataFrame:
        """Formats batch rows as freshly opened active versions.

        Args:
            df_new_idx(pd.DataFrame): New data indexed by business keys.
            keys(pd.Index): Keys of the rows to open.

        Returns:
            pd.DataFrame: Selected rows with the effective date set to the
                input timestamp, the end date set to ``max_date`` and the
                current flag set to True.
        """
        opened = df_new_idx.loc[keys].reset_index()
        opened[self.effective_date_col] = opened[self.input_timestamp_col]
        opened[self.end_date_col] = self.max_date
        opened[self.current_flag_col] = True
        return opened

    def _get_new_records(
        self, df_new_idx: pd.DataFrame, new_keys: pd.Index
    ) -> pd.DataFrame:
        """Extracts and formats brand new records.

        Args:
            df_new_idx(pd.DataFrame): New data indexed by business keys.
            new_keys(pd.Index): Keys that only exist in the new batch.

        Returns:
            pd.DataFrame: Formatted dataframe for new insertions.
        """
        return self._open_records(df_new_idx, new_keys)

    def _get_changed_records(
        self, df_new_idx: pd.DataFrame, changed_keys: pd.Index
    ) -> pd.DataFrame:
        """Extracts and formats new active versions of changed records.

        Args:
            df_new_idx(pd.DataFrame): New data indexed by business keys.
            changed_keys(pd.Index): Keys with detected attribute modifications.

        Returns:
            pd.DataFrame: Formatted dataframe for updated active rows.
        """
        return self._open_records(df_new_idx, changed_keys)

    def _get_expired_records(
        self,
        df_new_idx: pd.DataFrame,
        df_exist_idx: pd.DataFrame,
        changed_keys: pd.Index,
    ) -> pd.DataFrame:
        """Closes the previous versions of changed records.

        Args:
            df_new_idx(pd.DataFrame): New data indexed by business keys.
            df_exist_idx(pd.DataFrame): Active existing data indexed by keys.
            changed_keys(pd.Index): Keys with detected attribute modifications.

        Returns:
            pd.DataFrame: Previous versions with the end date set to the
                input timestamp of the replacing row and the flag set to False.
        """
        df_expired = df_exist_idx.loc[changed_keys].reset_index()
        # Both selections follow the order of ``changed_keys``, so the
        # timestamps can be assigned by position.
        closing_timestamps = df_new_idx.loc[
            changed_keys, self.input_timestamp_col
        ].values

        df_expired[self.end_date_col] = closing_timestamps
        df_expired[self.current_flag_col] = False
        return df_expired

    def _get_unchanged_records(
        self, df_exist_idx: pd.DataFrame, unchanged_keys: pd.Index
    ) -> pd.DataFrame:
        """Extracts active records that remain identical.

        Args:
            df_exist_idx(pd.DataFrame): Active existing data indexed by keys.
            unchanged_keys(pd.Index): Keys with no attribute modifications.

        Returns:
            pd.DataFrame: Existing active rows without modifications.
        """
        return df_exist_idx.loc[unchanged_keys].reset_index()

    def _get_preserved_records(
        self, df_exist_idx: pd.DataFrame, preserved_keys: pd.Index
    ) -> pd.DataFrame:
        """Extracts active records completely missing from the incoming batch.

        Args:
            df_exist_idx(pd.DataFrame): Active existing data indexed by keys.
            preserved_keys(pd.Index): Keys missing from the new batch.

        Returns:
            pd.DataFrame: Existing active rows that must be preserved.
        """
        return df_exist_idx.loc[preserved_keys].reset_index()
File without changes
@@ -0,0 +1,283 @@
1
+ """
2
+ This module implements Slowly Changing Dimension Type 2 (SCD2) logic using
3
+ PySpark.
4
+ """
5
+
6
+ from typing import List
7
+
8
+ from pyspark.sql import Column, DataFrame
9
+ from pyspark.sql import functions as F
10
+
11
+
12
class SparkSCD2Processor:
    """Handles Slowly Changing Dimension Type 2 (SCD2) logic using PySpark.

    The processor stores the names of the business-key, comparison and SCD2
    metadata columns once, so the same configuration can be applied to any
    (incoming batch, existing dimension) pair through ``process``.
    """

    def __init__(
        self,
        business_keys: List[str],
        compare_columns: List[str],
        effective_date_col: str = "effective_date",
        end_date_col: str = "end_date",
        current_flag_col: str = "is_current",
        input_timestamp_col: str = "input_timestamp",
        max_date: str = "9999-12-31",
    ):
        """Initializes the SparkSCD2Processor with configuration parameters.

        Args:
            business_keys(List[str]): List of columns that uniquely identify
                a record.
            compare_columns(List[str]): List of columns used to detect
                changes in the data.
            effective_date_col(str): Name of the effective start date column.
                Defaults to "effective_date".
            end_date_col(str): Name of the effective end date column.
                Defaults to "end_date".
            current_flag_col(str): Name of the boolean flag column
                indicating the active record. Defaults to "is_current".
            input_timestamp_col(str): Name of the column containing the
                ingestion/mutation timestamp. Defaults to "input_timestamp".
            max_date(str): The maximum date string used to populate active
                records. Defaults to "9999-12-31".

        Returns:
            None: This method initializes the class instance.
        """
        self.business_keys = business_keys
        self.compare_columns = compare_columns
        self.effective_date_col = effective_date_col
        self.end_date_col = end_date_col
        self.current_flag_col = current_flag_col
        self.input_timestamp_col = input_timestamp_col
        self.max_date = max_date

        # Aliases used to disambiguate same-named columns after the join.
        self._new_alias = "new"
        self._exist_alias = "exist"

    def process(
        self, spark_df_new: DataFrame, spark_df_existing: DataFrame
    ) -> DataFrame:
        """Orchestrates the SCD2 pipeline process and returns result.

        The output is the union of five disjoint groups: brand new keys,
        new versions of changed keys, closed versions of changed keys,
        current rows whose keys are absent from the batch, and current rows
        that are unchanged. Output columns follow ``spark_df_existing``.

        NOTE(review): every branch that reads the existing dimension keeps
        only rows whose current flag is true, so rows of
        ``spark_df_existing`` that are already closed are not part of the
        returned DataFrame. Confirm callers either pass only active rows or
        re-attach the history themselves.

        Args:
            spark_df_new(DataFrame): Incoming batch of data.
            spark_df_existing(DataFrame): Current state of dimension.

        Returns:
            DataFrame: Unified DataFrame with all processed records.

        Raises:
            ValueError: If ``compare_columns`` is empty.
        """
        target_columns = spark_df_existing.columns

        df_new_prepared = self._prepare_new_df(spark_df_new)
        df_joined = self._join_datasets(df_new_prepared, spark_df_existing)

        change_cond = self._build_change_condition()
        unchanged_cond = self._build_unchanged_condition()

        df_new_only = self._get_new_records(df_joined, target_columns)
        df_changed = self._get_changed_records(
            df_joined, change_cond, target_columns
        )
        df_unchanged = self._get_unchanged_records(
            df_joined, unchanged_cond, target_columns
        )
        df_expired = self._get_expired_records(
            df_joined, change_cond, target_columns
        )
        df_preserved = self._get_preserved_records(
            spark_df_new, spark_df_existing, target_columns
        )

        return (
            df_new_only.unionByName(df_changed)
            .unionByName(df_expired)
            .unionByName(df_preserved)
            .unionByName(df_unchanged)
        )

    def _prepare_new_df(self, df: DataFrame) -> DataFrame:
        """Prepares the incoming new dataset by injecting SCD2 metadata.

        The effective date is copied from the input timestamp column, the
        end date is set to ``max_date`` and the current flag to True.

        Args:
            df(DataFrame): The raw incoming PySpark DataFrame.

        Returns:
            DataFrame: PySpark DataFrame with SCD2 columns added.
        """
        return (
            df.withColumn(self.effective_date_col, F.col(self.input_timestamp_col))
            .withColumn(self.end_date_col, F.lit(self.max_date))
            .withColumn(self.current_flag_col, F.lit(True))
        )

    def _join_datasets(self, df_new: DataFrame, df_existing: DataFrame) -> DataFrame:
        """Performs a left join between new data and historical dimension.

        Both sides are aliased so that later stages can address columns as
        ``new.<col>`` and ``exist.<col>``.

        Args:
            df_new(DataFrame): Prepared incoming DataFrame.
            df_existing(DataFrame): Historical dimension table.

        Returns:
            DataFrame: Joined PySpark DataFrame.
        """
        join_condition = [
            F.col(f"{self._new_alias}.{key}") == F.col(f"{self._exist_alias}.{key}")
            for key in self.business_keys
        ]
        return df_new.alias(self._new_alias).join(
            df_existing.alias(self._exist_alias),
            on=join_condition,
            how="left",
        )

    def _null_safe_equal(self, column_name: str) -> Column:
        """Builds a null-safe equality test between both sides of the join.

        Plain ``==`` / ``!=`` evaluate to NULL when either operand is NULL,
        and a NULL filter condition drops the row. ``eqNullSafe`` always
        yields True or False, treating two NULLs as equal.

        Args:
            column_name(str): Name of the column to compare.

        Returns:
            Column: Boolean expression, never NULL.
        """
        return F.col(f"{self._new_alias}.{column_name}").eqNullSafe(
            F.col(f"{self._exist_alias}.{column_name}")
        )

    def _build_change_condition(self) -> Column:
        """Builds expression to identify mismatches across comparison columns.

        A row counts as changed when at least one comparison column differs,
        where NULL versus a value is a difference and NULL versus NULL is not.

        Args:

        Returns:
            Column: PySpark Column representing the logical OR condition.

        Raises:
            ValueError: If ``compare_columns`` is empty.
        """
        if not self.compare_columns:
            raise ValueError(
                "compare_columns cannot be empty for SCD2 processing."
            )

        mismatches = [
            ~self._null_safe_equal(name) for name in self.compare_columns
        ]

        # Seed with the first comparison so the accumulator is a Column.
        change_condition = mismatches[0]
        # Fold the remaining comparisons in with logical OR.
        for mismatch in mismatches[1:]:
            change_condition = change_condition | mismatch

        return change_condition

    def _build_unchanged_condition(self) -> Column:
        """Builds expression to verify that columns are strictly identical.

        This is the exact complement of the change condition: every
        comparison column must be null-safe equal on both sides.

        Args:

        Returns:
            Column: PySpark Column representing the logical AND condition.
        """
        unchanged_condition = F.lit(True)
        for name in self.compare_columns:
            unchanged_condition = unchanged_condition & self._null_safe_equal(name)
        return unchanged_condition

    def _get_new_records(self, df_joined: DataFrame, columns: List[str]) -> DataFrame:
        """Extracts records whose business keys do not exist in dimension.

        Args:
            df_joined(DataFrame): The combined/joined PySpark DataFrame.
            columns(List[str]): List of output columns required.

        Returns:
            DataFrame: PySpark DataFrame containing brand new rows.
        """
        # A NULL first business key on the existing side is used as the
        # marker that the left join found no match for this incoming row.
        key_col = f"{self._exist_alias}.{self.business_keys[0]}"
        select_exprs = [F.col(f"{self._new_alias}.{c}").alias(c) for c in columns]
        return df_joined.filter(F.col(key_col).isNull()).select(select_exprs)

    def _get_changed_records(
        self, df_joined: DataFrame, change_cond: Column, columns: List[str]
    ) -> DataFrame:
        """Extracts newest version of records that suffered changes.

        Args:
            df_joined(DataFrame): The combined/joined PySpark DataFrame.
            change_cond(Column): PySpark Column condition for changes.
            columns(List[str]): List of output columns required.

        Returns:
            DataFrame: PySpark DataFrame containing active updated records.
        """
        is_current_cond = F.col(f"{self._exist_alias}.{self.current_flag_col}")
        # Values come from the incoming side: this is the new active version.
        select_exprs = [F.col(f"{self._new_alias}.{c}").alias(c) for c in columns]
        return (
            df_joined.filter(is_current_cond).filter(change_cond).select(select_exprs)
        )

    def _get_unchanged_records(
        self, df_joined: DataFrame, unchanged_cond: Column, columns: List[str]
    ) -> DataFrame:
        """Extracts existing active records that have no changes.

        Args:
            df_joined(DataFrame): The combined/joined PySpark DataFrame.
            unchanged_cond(Column): PySpark Column condition for identity.
            columns(List[str]): List of output columns required.

        Returns:
            DataFrame: PySpark DataFrame containing untouched active rows.
        """
        is_current_cond = F.col(f"{self._exist_alias}.{self.current_flag_col}")
        # Values come from the existing side so its metadata is kept as is.
        select_exprs = [F.col(f"{self._exist_alias}.{c}").alias(c) for c in columns]
        return (
            df_joined.filter(is_current_cond)
            .filter(unchanged_cond)
            .select(select_exprs)
        )

    def _get_expired_records(
        self, df_joined: DataFrame, change_cond: Column, columns: List[str]
    ) -> DataFrame:
        """Transforms existing active rows into historically closed versions.

        Every column is taken from the existing side except the end date,
        which becomes the incoming row's input timestamp, and the current
        flag, which becomes False.

        Args:
            df_joined(DataFrame): The combined/joined PySpark DataFrame.
            change_cond(Column): PySpark Column condition for changes.
            columns(List[str]): List of output columns required.

        Returns:
            DataFrame: PySpark DataFrame containing expired records.
        """
        is_current_cond = F.col(f"{self._exist_alias}.{self.current_flag_col}")

        select_exprs = []
        for c in columns:
            if c == self.end_date_col:
                expr = F.col(f"{self._new_alias}.{self.input_timestamp_col}")
            elif c == self.current_flag_col:
                expr = F.lit(False)
            else:
                expr = F.col(f"{self._exist_alias}.{c}")
            select_exprs.append(expr.alias(c))

        return (
            df_joined.filter(is_current_cond).filter(change_cond).select(select_exprs)
        )

    def _get_preserved_records(
        self, df_new: DataFrame, df_existing: DataFrame, columns: List[str]
    ) -> DataFrame:
        """Preserves active records missing from the new batch.

        Args:
            df_new(DataFrame): The raw incoming PySpark DataFrame.
            df_existing(DataFrame): The existing dimension table.
            columns(List[str]): List of output columns required.

        Returns:
            DataFrame: PySpark DataFrame containing unaffected rows.
        """
        is_current_cond = F.col(f"{self._exist_alias}.{self.current_flag_col}")
        select_exprs = [F.col(f"{self._exist_alias}.{c}").alias(c) for c in columns]
        # left_anti keeps only existing rows whose business keys have no
        # counterpart in the incoming batch.
        return (
            df_existing.alias(self._exist_alias)
            .join(
                df_new.alias(self._new_alias),
                on=self.business_keys,
                how="left_anti",
            )
            .filter(is_current_cond)
            .select(select_exprs)
        )
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Tu Nombre Completo
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,107 @@
1
+ Metadata-Version: 2.3
2
+ Name: data-engineering-exp
3
+ Version: 0.1.0
4
+ Summary: A minimalist, agnostic Python framework to standardize data engineering pipelines.
5
+ License: MIT
6
+ Keywords: data-engineering,pyspark,pandas,data-catalog
7
+ Author: idperez720
8
+ Author-email: ivandavidperez4@gmail.com
9
+ Requires-Python: >=3.11,<4.0.0
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Operating System :: OS Independent
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Programming Language :: Python :: 3.13
16
+ Requires-Dist: click (>=8.4.1,<9.0.0)
17
+ Requires-Dist: pandas (>=3.0.3,<4.0.0)
18
+ Requires-Dist: pyspark (>=4.1.2,<5.0.0)
19
+ Requires-Dist: pyyaml (>=6.0.3,<7.0.0)
20
+ Project-URL: Repository, https://github.com/idperez720/data-engineering-exp
21
+ Description-Content-Type: text/markdown
22
+
23
+ # Data Engineering Experience 🚀
24
+
25
+ [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT)
26
+ [![Python Version](https://img.shields.io/badge/python-3.11%20%7C%203.12%20%7C%203.13-blue)](https://www.python.org)
27
+
28
+ **dex** (*Data Engineering Experience*) is a minimalist, agnostic Python framework designed to streamline and standardize data engineering pipelines. By embracing **Convention over Configuration**, `dex` eliminates environment friction, absolute path hardcoding, and complex PySpark session management.
29
+
30
+ ---
31
+
32
+ ## ✨ Key Features
33
+
34
+ * **Zero-Config File Discovery:** Automatic tree-walking directory resolution anchors your data catalog using your local `pyproject.toml` file.
35
+ * **Decentralized Catalog:** Declare your metadata layouts inside modular, self-contained mini-YAML files.
36
+ * **Elastic Processing Runtimes:** Switch dynamically between **Pandas** and **PySpark** execution engines using exactly the same unified interface.
37
+ * **Interactive CLI Scaffolding:** Spin up a new production-ready data directory structure instantly with `dex init`.
38
+
39
+ ---
40
+
41
+ ## 📦 Installation
42
+
43
+ *(Once published to PyPI)*
44
+ ```bash
45
+ pip install data-engineering-exp
46
+ ```
47
+ Or install it directly from the source repository using Poetry:
48
+ ```bash
49
+ poetry add git+https://github.com/idperez720/data-engineering-exp.git
50
+ ```
51
+
52
+ ---
53
+
54
+ ## 🏁 Quick Start
55
+
56
+ ### 1. Initialize your workspace
57
+
58
+ Navigate to an empty directory and let the interactive wizard scaffold the workspace conventions:
59
+
60
+ ```bash
61
+ dex init
62
+
63
+ ```
64
+
65
+ ### 2. Declare a dataset
66
+
67
+ Add a specification block inside `conf/catalog/sample_dataset.yaml`:
68
+
69
+ ```yaml
70
+ customers:
71
+ description: "Main production customer data"
72
+ format: "csv"
73
+ engine: "pandas"
74
+ storage_path: "data/sample_table.csv"
75
+
76
+ ```
77
+
78
+ ### 3. Load data anywhere
79
+
80
+ Create a Python script or open a Jupyter Notebook inside `src/notebooks/` and fetch your data instantly:
81
+
82
+ ```python
83
+ from data_engineering_exp.core.io import DataLoader
84
+
85
+ # Autodiscovers your project root boundaries and settings
86
+ loader = DataLoader()
87
+
88
+ # Loads the dataset securely as a Pandas DataFrame
89
+ df = loader.load("customers")
90
+ df.head()
91
+
92
+ ```
93
+
94
+ ---
95
+
96
+ ## 📖 Complete Documentation
97
+
98
+ For comprehensive guides, testing architecture deep-dives, and complete API references, visit our documentation site:
99
+ 👉 **[http://127.0.0.1:8000/](http://127.0.0.1:8000/)** *(Replace with your deployed docs URL, e.g., GitHub Pages)*
100
+
101
+ ---
102
+
103
+ ## ⚖️ License
104
+
105
+ Distributed under the **MIT License**. Any modification or distribution (including forks) must include the original copyright notice and liability waiver. See `LICENSE` for more information.
106
+
107
+
@@ -0,0 +1,16 @@
1
+ data_engineering_exp/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ data_engineering_exp/cli.py,sha256=KNQ8Ei4_wU-MTEH_pca3cOhwQJ0rV0wAZ3FwHtWuht8,1783
3
+ data_engineering_exp/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
+ data_engineering_exp/core/catalog.py,sha256=wu4QnsHus9QJi6dhQR_Eu4-elxAY-dTEor2goZpuE-U,7226
5
+ data_engineering_exp/core/deduplication.py,sha256=aLJrJtVQG56wI39ImSKvsIaq1U36hH7_-t4CjB3fwGw,6472
6
+ data_engineering_exp/core/initializer.py,sha256=Q0Kx7U5HxNu19pQYGvxxfy-1TVeFpHF4jMXKfji2i9c,3484
7
+ data_engineering_exp/core/io.py,sha256=VIzAEP5iQduUJ7OWV2K82iAdelpkPT9lk7NqU1BuRXA,4825
8
+ data_engineering_exp/pandas_core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
9
+ data_engineering_exp/pandas_core/scd2.py,sha256=KGeRYVv2ClxEoO9WUEVfF5Bdll3l-clJmIqYlT5EbZQ,8668
10
+ data_engineering_exp/spark_core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
+ data_engineering_exp/spark_core/scd2.py,sha256=Zza0ePChxq9TV74OK5gr8odhUF7sDiX_lmoAS2qwyV4,10841
12
+ data_engineering_exp-0.1.0.dist-info/LICENSE,sha256=AVQrXW_hCAJSW_1PiMnJmic2hJ5aT9ypRcQEqIY4sdw,1074
13
+ data_engineering_exp-0.1.0.dist-info/METADATA,sha256=V3OeZlQeGXkdJdiuHzAgmrTQys6BJ0o7lgM7SG8W2a8,3607
14
+ data_engineering_exp-0.1.0.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
15
+ data_engineering_exp-0.1.0.dist-info/entry_points.txt,sha256=xHkLyRWvtD12WAnIZ4p0XnzRvhvr3Mt0wj5x8hcJ07A,60
16
+ data_engineering_exp-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: poetry-core 2.1.3
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,3 @@
1
+ [console_scripts]
2
+ dex=data_engineering_exp.cli:entry_point
3
+