data-engineering-exp 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 idperez720
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,107 @@
1
+ Metadata-Version: 2.3
2
+ Name: data-engineering-exp
3
+ Version: 0.1.0
4
+ Summary: A minimalist, agnostic Python framework to standardize data engineering pipelines.
5
+ License: MIT
6
+ Keywords: data-engineering,pyspark,pandas,data-catalog
7
+ Author: idperez720
8
+ Author-email: ivandavidperez4@gmail.com
9
+ Requires-Python: >=3.11,<4.0.0
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Operating System :: OS Independent
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Programming Language :: Python :: 3.13
16
+ Requires-Dist: click (>=8.4.1,<9.0.0)
17
+ Requires-Dist: pandas (>=3.0.3,<4.0.0)
18
+ Requires-Dist: pyspark (>=4.1.2,<5.0.0)
19
+ Requires-Dist: pyyaml (>=6.0.3,<7.0.0)
20
+ Project-URL: Repository, https://github.com/idperez720/data-engineering-exp
21
+ Description-Content-Type: text/markdown
22
+
23
+ # Data Engineering Experience 🚀
24
+
25
+ [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT)
26
+ [![Python Version](https://img.shields.io/badge/python-3.11%20%7C%203.12%20%7C%203.13-blue)](https://www.python.org)
27
+
28
+ **dex** (*Data Engineering Experience*) is a minimalist, agnostic Python framework designed to streamline and standardize data engineering pipelines. By embracing **Convention over Configuration**, `dex` eliminates environment friction, absolute path hardcoding, and complex PySpark session management.
29
+
30
+ ---
31
+
32
+ ## ✨ Key Features
33
+
34
+ * **Zero-Config File Discovery:** Automatic tree-walking directory resolution anchors your data catalog using your local `pyproject.toml` file.
35
+ * **Decentralized Catalog:** Declare your metadata layouts inside modular, self-contained mini-YAML files.
36
+ * **Elastic Processing Runtimes:** Switch dynamically between **Pandas** and **PySpark** execution engines using exactly the same unified interface.
37
+ * **Interactive CLI Scaffolding:** Spin up a new production-ready data directory structure instantly with `dex init`.
38
+
39
+ ---
40
+
41
+ ## 📦 Installation
42
+
43
+ *(Once published to PyPI)*
44
+ ```bash
45
+ pip install data-engineering-exp
46
+ ```
47
+ Or install it directly from the source repository using Poetry:
48
+ ```bash
49
+ poetry add git+https://github.com/idperez720/data-engineering-exp.git
50
+ ```
51
+
52
+ ---
53
+
54
+ ## 🏁 Quick Start
55
+
56
+ ### 1. Initialize your workspace
57
+
58
+ Navigate to an empty directory and let the interactive wizard scaffold the workspace conventions:
59
+
60
+ ```bash
61
+ dex init
62
+
63
+ ```
64
+
65
+ ### 2. Declare a dataset
66
+
67
+ Add a specification block inside `conf/catalog/sample_dataset.yaml`:
68
+
69
+ ```yaml
70
+ customers:
71
+ description: "Main production customer data"
72
+ format: "csv"
73
+ engine: "pandas"
74
+ storage_path: "data/sample_table.csv"
75
+
76
+ ```
77
+
78
+ ### 3. Load data anywhere
79
+
80
+ Create a Python script or open a Jupyter Notebook inside `src/notebooks/` and fetch your data instantly:
81
+
82
+ ```python
83
+ from data_engineering_exp.core.io import DataLoader
84
+
85
+ # Autodiscovers your project root boundaries and settings
86
+ loader = DataLoader()
87
+
88
+ # Loads the dataset securely as a Pandas DataFrame
89
+ df = loader.load("customers")
90
+ df.head()
91
+
92
+ ```
93
+
94
+ ---
95
+
96
+ ## 📖 Complete Documentation
97
+
98
+ For comprehensive guides, testing architecture deep-dives, and complete API references, visit our documentation site:
99
+ 👉 **[github.com/idperez720/data-engineering-exp](https://github.com/idperez720/data-engineering-exp)** *(project repository — replace with the hosted docs URL, e.g., GitHub Pages, once it is deployed)*
100
+
101
+ ---
102
+
103
+ ## ⚖️ License
104
+
105
+ Distributed under the **MIT License**. Any modification or distribution (including forks) must include the original copyright notice and liability waiver. See `LICENSE` for more information.
106
+
107
+
@@ -0,0 +1,85 @@
1
+ # Data Engineering Experience 🚀
2
+
3
+ [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT)
4
+ [![Python Version](https://img.shields.io/badge/python-3.11%20%7C%203.12%20%7C%203.13-blue)](https://www.python.org)
5
+
6
+ **dex** (*Data Engineering Experience*) is a minimalist, agnostic Python framework designed to streamline and standardize data engineering pipelines. By embracing **Convention over Configuration**, `dex` eliminates environment friction, absolute path hardcoding, and complex PySpark session management.
7
+
8
+ ---
9
+
10
+ ## ✨ Key Features
11
+
12
+ * **Zero-Config File Discovery:** Automatic tree-walking directory resolution anchors your data catalog using your local `pyproject.toml` file.
13
+ * **Decentralized Catalog:** Declare your metadata layouts inside modular, self-contained mini-YAML files.
14
+ * **Elastic Processing Runtimes:** Switch dynamically between **Pandas** and **PySpark** execution engines using exactly the same unified interface.
15
+ * **Interactive CLI Scaffolding:** Spin up a new production-ready data directory structure instantly with `dex init`.
16
+
17
+ ---
18
+
19
+ ## 📦 Installation
20
+
21
+ *(Once published to PyPI)*
22
+ ```bash
23
+ pip install data-engineering-exp
24
+ ```
25
+ Or install it directly from the source repository using Poetry:
26
+ ```bash
27
+ poetry add git+https://github.com/idperez720/data-engineering-exp.git
28
+ ```
29
+
30
+ ---
31
+
32
+ ## 🏁 Quick Start
33
+
34
+ ### 1. Initialize your workspace
35
+
36
+ Navigate to an empty directory and let the interactive wizard scaffold the workspace conventions:
37
+
38
+ ```bash
39
+ dex init
40
+
41
+ ```
42
+
43
+ ### 2. Declare a dataset
44
+
45
+ Add a specification block inside `conf/catalog/sample_dataset.yaml`:
46
+
47
+ ```yaml
48
+ customers:
49
+ description: "Main production customer data"
50
+ format: "csv"
51
+ engine: "pandas"
52
+ storage_path: "data/sample_table.csv"
53
+
54
+ ```
55
+
56
+ ### 3. Load data anywhere
57
+
58
+ Create a Python script or open a Jupyter Notebook inside `src/notebooks/` and fetch your data instantly:
59
+
60
+ ```python
61
+ from data_engineering_exp.core.io import DataLoader
62
+
63
+ # Autodiscovers your project root boundaries and settings
64
+ loader = DataLoader()
65
+
66
+ # Loads the dataset securely as a Pandas DataFrame
67
+ df = loader.load("customers")
68
+ df.head()
69
+
70
+ ```
71
+
72
+ ---
73
+
74
+ ## 📖 Complete Documentation
75
+
76
+ For comprehensive guides, testing architecture deep-dives, and complete API references, visit our documentation site:
77
+ 👉 **[github.com/idperez720/data-engineering-exp](https://github.com/idperez720/data-engineering-exp)** *(project repository — replace with the hosted docs URL, e.g., GitHub Pages, once it is deployed)*
78
+
79
+ ---
80
+
81
+ ## ⚖️ License
82
+
83
+ Distributed under the **MIT License**. Any modification or distribution (including forks) must include the original copyright notice and liability waiver. See `LICENSE` for more information.
84
+
85
+
@@ -0,0 +1,63 @@
1
+ """Command Line Interface (CLI) entry point for dex using Click."""
2
+
3
+ import os
4
+
5
+ import click
6
+
7
+ from data_engineering_exp.core.initializer import ProjectInitializer
8
+
9
+
10
@click.group()
def entry_point() -> None:
    """dex - Data Engineering Experience CLI utilities.
    \f

    Click uses this docstring as the ``--help`` text of the command group.
    The form-feed marker above truncates that text, so only the summary line
    is shown to CLI users and these developer notes stay out of the help
    output.

    Returns:
        None: The function body is intentionally empty; it only acts as the
            group that subcommands register on via ``@entry_point.command()``.
    """
20
+
21
+
22
@entry_point.command()
@click.option(
    "--path",
    type=click.Path(file_okay=False, dir_okay=True),
    default=".",
    help="Root directory where scaffolding will be built. Defaults to '.'",
)
def init(path: str) -> None:
    """Scaffold a new data engineering project structure interactively.
    \f

    Everything below the form-feed marker is hidden from ``dex init --help``;
    Click only renders the summary line above it.

    Prompts for project name, version, description and author (each with a
    default), then hands the answers to ``ProjectInitializer``.

    Args:
        path(str): Target directory in which the project structure is built.

    Returns:
        None: Works through side effects (prompts, console output and the
            work done by ``ProjectInitializer.init_project``).
    """
    click.echo("🚀 Welcome to the dex project initialization wizard!\n")

    # abspath() normalises "." and trailing separators, so the basename is
    # only empty when the target is the filesystem root; fall back then.
    default_name = os.path.basename(os.path.abspath(path)) or "my-dex-project"

    # Sequential metadata prompts with default fallbacks
    name = click.prompt("Project name", default=default_name, type=str)
    version = click.prompt("Version", default="0.1.0", type=str)
    description = click.prompt(
        "Description",
        default="Data engineering project scaffolded by dex",
        type=str,
    )
    author = click.prompt("Author", default="Anonymous", type=str)

    initializer = ProjectInitializer(base_path=path)
    initializer.init_project(
        name=name,
        version=version,
        description=description,
        author=author,
    )

    click.echo(f"\n✨ Project successfully initialized at '{path}'!")
@@ -0,0 +1,197 @@
1
+ """This module implements decentralized data catalog engines for dex."""
2
+
3
+ import os
4
+ from typing import Any, Dict, List, Optional, cast
5
+
6
+ import yaml
7
+
8
+
9
class DataCatalog:
    """Loads dataset definitions from YAML catalog sources into one lookup table.

    A catalog source is either a single ``.yml``/``.yaml`` file or a directory
    that is scanned recursively, so definitions may be split into mini-YAML
    files or kept within a single file. Every top-level key of a YAML mapping
    is stored as a dataset name pointing at that dataset's properties.
    """

    # File suffixes that are treated as catalog definition files.
    _YAML_SUFFIXES = (".yml", ".yaml")

    def __init__(self, catalog_path: Optional[str] = None) -> None:
        """Initializes the catalog from an explicit or auto-discovered source.

        If no path is provided, parent directories of the current working
        directory are searched for a ``pyproject.toml`` file and the
        ``conf/catalog`` directory next to it is used.

        Args:
            catalog_path(Optional[str]): Directory or file holding catalog
                definitions. Defaults to None (auto-discovery).

        Returns:
            None: Initializes the class instance.

        Raises:
            FileNotFoundError: If the catalog path does not exist, or if
                auto-discovery cannot find a ``pyproject.toml`` file.
        """
        self.project_root: str = ""

        if catalog_path is None:
            # Also sets self.project_root as a side effect.
            catalog_path = self._discover_catalog_path()
        else:
            # The root is derived purely from the path shape: two levels above
            # a directory, three above a file (i.e. it presumes a
            # <root>/conf/catalog[/file.yaml] layout).
            abs_path = os.path.abspath(catalog_path)
            levels_up = 3 if os.path.isfile(abs_path) else 2
            root = abs_path
            for _ in range(levels_up):
                root = os.path.dirname(root)
            self.project_root = root

        if not os.path.exists(catalog_path):
            raise FileNotFoundError(f"Catalog source path not found at: {catalog_path}")

        self._datasets: Dict[str, Dict[str, Any]] = {}
        self._load_catalog_sources(catalog_path)

    @property
    def dataset_names(self) -> List[str]:
        """Exposes the list of all dataset identifiers present in the catalog.

        Returns:
            List[str]: Names of all loaded datasets, in load order.
        """
        return list(self._datasets)

    def get_dataset_metadata(self, dataset_name: str) -> Dict[str, Any]:
        """Retrieves the properties of a dataset, excluding its column list.

        Args:
            dataset_name(str): Unique identifier string of the dataset.

        Returns:
            Dict[str, Any]: Shallow copy of the dataset entry without the
                ``columns`` key.

        Raises:
            KeyError: If the target dataset name is missing from the catalog.
        """
        meta = self._get_entry(dataset_name).copy()
        meta.pop("columns", None)
        return meta

    def get_column_names(self, dataset_name: str) -> List[str]:
        """Extracts the expected list of column names for a target dataset.

        Args:
            dataset_name(str): Unique identifier string of the dataset.

        Returns:
            List[str]: Column names in declaration order. Empty when the
                dataset declares no columns or an empty ``columns:`` key.

        Raises:
            KeyError: If the target dataset name is missing from the catalog.
        """
        # ``columns:`` with no value parses to None in YAML; treat it like an
        # absent key instead of failing while iterating None.
        columns_spec = self._get_entry(dataset_name).get("columns") or []
        return [col["name"] for col in columns_spec]

    def validate_schema_presence(self, df: Any, dataset_name: str) -> bool:
        """Validates that all columns defined in the catalog exist in the dataframe.

        Extra columns in the dataframe are not reported; only missing ones are.

        Args:
            df(Any): Active input Pandas or PySpark DataFrame object.
            dataset_name(str): Catalog identifier string of the dataset target.

        Returns:
            bool: True if all expected columns are present.

        Raises:
            KeyError: If the target dataset name is missing from the catalog.
            TypeError: If the input dataframe structure is not supported.
            ValueError: If a column mismatch is discovered against the catalog.
        """
        expected_cols = set(self.get_column_names(dataset_name))

        # The type is detected by name/module so that neither pandas nor
        # pyspark has to be imported by this module.
        df_class = type(df)
        is_dataframe = df_class.__name__ == "DataFrame"
        if is_dataframe and "pandas" in df_class.__module__:
            actual_cols = set(df.columns.tolist())
        elif is_dataframe and "pyspark" in df_class.__module__:
            actual_cols = set(df.columns)
        else:
            raise TypeError("Unsupported DataFrame type for validation.")

        missing_cols = expected_cols - actual_cols
        if missing_cols:
            # Sorted so the message is reproducible across runs.
            raise ValueError(
                f"Schema mismatch for '{dataset_name}'. "
                f"Missing expected catalog columns: {sorted(missing_cols, key=str)}"
            )

        return True

    def _get_entry(self, dataset_name: str) -> Dict[str, Any]:
        """Returns the raw catalog entry for a dataset.

        Args:
            dataset_name(str): Unique identifier string of the dataset.

        Returns:
            Dict[str, Any]: The stored entry itself (not a copy).

        Raises:
            KeyError: If the target dataset name is missing from the catalog.
        """
        if dataset_name not in self._datasets:
            raise KeyError(f"Dataset '{dataset_name}' missing from catalog.")
        return self._datasets[dataset_name]

    def _discover_catalog_path(self) -> str:
        """Traverses parent directories upwards to locate the pyproject.toml file.

        Starts at the current working directory. Once the anchor file is
        found, its directory is stored in ``self.project_root`` and the
        ``conf/catalog`` path below it is returned (its existence is checked
        by the caller).

        Returns:
            str: Path to the ``conf/catalog`` directory under the project root.

        Raises:
            FileNotFoundError: If the pyproject.toml configuration cannot be found.
        """
        current_dir = os.getcwd()

        while True:
            if os.path.exists(os.path.join(current_dir, "pyproject.toml")):
                self.project_root = current_dir
                return os.path.join(current_dir, "conf", "catalog")

            parent_dir = os.path.dirname(current_dir)
            if parent_dir == current_dir:
                # dirname() is a fixed point at the filesystem root: stop.
                break
            current_dir = parent_dir

        raise FileNotFoundError(
            "Configuration file (pyproject.toml) could not be located. "
            "Please run 'dex init' to establish a valid project layout."
        )

    def _load_catalog_sources(self, path: str) -> None:
        """Parses a single catalog file or every catalog file under a directory.

        Directories and files are visited in sorted order. Later files
        overwrite earlier ones on duplicate dataset names, so a fixed order
        makes the winner reproducible instead of filesystem-dependent.

        Args:
            path(str): Catalog file path or catalog directory.

        Returns:
            None: Populates the internal dataset dictionary.
        """
        if os.path.isfile(path):
            # A file with any other suffix is ignored.
            if path.endswith(self._YAML_SUFFIXES):
                self._parse_file(path)
            return

        for root, dirs, files in os.walk(path):
            # In-place sort steers the order in which os.walk descends.
            dirs.sort()
            for file_name in sorted(files):
                if file_name.endswith(self._YAML_SUFFIXES):
                    self._parse_file(os.path.join(root, file_name))

    def _parse_file(self, file_path: str) -> None:
        """Parses an individual YAML catalog file and merges it into the catalog.

        Empty files and documents whose top level is not a mapping are
        skipped. Existing dataset names are overwritten by the new file.

        Args:
            file_path(str): Location of the YAML file.

        Returns:
            None: Updates the internal dataset dictionary.
        """
        with open(file_path, "r", encoding="utf-8") as stream:
            content = yaml.safe_load(stream)

        if content and isinstance(content, dict):
            typed_content = cast(Dict[str, Dict[str, Any]], content)
            self._datasets.update(typed_content)
@@ -0,0 +1,192 @@
1
+ """This module implements versatile data deduplication utilities for dex."""
2
+
3
+ from typing import Any, List, Optional, Union
4
+
5
+ # Handle optional dependencies at the top of the file
6
+ try:
7
+ import pyspark.sql.functions as F
8
+ from pyspark.sql import Window
9
+
10
+ HAS_SPARK = True
11
+ except ImportError:
12
+ Window: Any = None
13
+ F: Any = None
14
+ HAS_SPARK = False
15
+
16
+
17
class Deduplicator:
    """Utility class providing multiple strategies for data deduplication.

    Every public method accepts either a Pandas or a PySpark DataFrame and
    dispatches on the detected framework. The row-picking strategies
    (``latest``, ``first``, ``by_order``) return one complete source row per
    key group; ``combined`` is the only strategy that mixes values from
    different rows.
    """

    def __init__(self) -> None:
        """Initializes the Deduplicator instance.

        Returns:
            None: The instance holds no state.
        """

    def latest(self, df: Any, keys: List[str], order_by_col: str) -> Any:
        """Extracts the most recent record per business key group.

        Args:
            df(Any): Input Pandas or PySpark DataFrame.
            keys(List[str]): Columns that identify a business entity.
            order_by_col(str): Column used to determine chronological order.

        Returns:
            Any: DataFrame with one whole row per key group, the one with the
                highest ``order_by_col`` value. For Pandas the result has the
                key columns first, is sorted by the keys and has a fresh index.
        """
        framework = self._detect_framework(df)
        if framework == "pandas":
            # Descending + "take the top row" keeps rows with a missing
            # ordering value at the bottom, so they only win by default.
            return self._top_row_per_key_pandas(df, keys, [order_by_col], False)

        window_spec = Window.partitionBy(keys).orderBy(F.col(order_by_col).desc())
        return self._top_row_per_key_spark(df, window_spec)

    def first(self, df: Any, keys: List[str], order_by_col: str) -> Any:
        """Extracts the earliest chronological record per business key group.

        Args:
            df(Any): Input Pandas or PySpark DataFrame.
            keys(List[str]): Columns that identify a business entity.
            order_by_col(str): Column used to determine chronological order.

        Returns:
            Any: DataFrame with one whole row per key group, the one with the
                lowest ``order_by_col`` value. For Pandas the result has the
                key columns first, is sorted by the keys and has a fresh index.
        """
        framework = self._detect_framework(df)
        if framework == "pandas":
            return self._top_row_per_key_pandas(df, keys, [order_by_col], True)

        window_spec = Window.partitionBy(keys).orderBy(F.col(order_by_col).asc())
        return self._top_row_per_key_spark(df, window_spec)

    def distinct(self, df: Any, subset: Optional[List[str]] = None) -> Any:
        """Removes strict duplicate rows from the DataFrame.

        Args:
            df(Any): Input Pandas or PySpark DataFrame.
            subset(Optional[List[str]]): Specific columns to consider when
                identifying duplicates. Defaults to None (all columns).

        Returns:
            Any: DataFrame with unique records only (fresh index for Pandas).
        """
        framework = self._detect_framework(df)
        if framework == "pandas":
            return df.drop_duplicates(subset=subset).reset_index(drop=True)

        # Passed positionally so the call does not depend on the name of the
        # parameter in the installed PySpark version.
        if subset is None:
            return df.dropDuplicates()
        return df.dropDuplicates(subset)

    def by_order(
        self,
        df: Any,
        keys: List[str],
        order_by_cols: List[str],
        ascending: Union[bool, List[bool]] = True,
    ) -> Any:
        """Deduplicates rows by sorting through multiple custom column criteria.

        Args:
            df(Any): Input Pandas or PySpark DataFrame.
            keys(List[str]): Columns that identify a business entity.
            order_by_cols(List[str]): Ordered list of columns to sort by.
            ascending(Union[bool, List[bool]]): Sort direction, either one
                flag for all columns or one flag per column. Defaults to True.

        Returns:
            Any: DataFrame with one whole row per key group, the row ranked
                first under the requested ordering.

        Raises:
            ValueError: If ``ascending`` is a list whose length differs from
                ``order_by_cols``.
        """
        framework = self._detect_framework(df)

        if isinstance(ascending, bool):
            asc_list = [ascending] * len(order_by_cols)
        else:
            asc_list = list(ascending)
            # zip() below would otherwise silently drop sort columns.
            if len(asc_list) != len(order_by_cols):
                raise ValueError(
                    f"Length of ascending ({len(asc_list)}) != "
                    f"length of order_by_cols ({len(order_by_cols)})"
                )

        if framework == "pandas":
            return self._top_row_per_key_pandas(df, keys, order_by_cols, asc_list)

        order_exprs = [
            F.col(col).asc() if asc_dir else F.col(col).desc()
            for col, asc_dir in zip(order_by_cols, asc_list)
        ]
        window_spec = Window.partitionBy(keys).orderBy(*order_exprs)
        return self._top_row_per_key_spark(df, window_spec)

    def combined(self, df: Any, keys: List[str]) -> Any:
        """Stitches rows together by compacting nulls into a golden record.

        For every non-key column the first non-null value found in the group
        is kept, so the resulting row may combine values of several rows.

        Args:
            df(Any): Input Pandas or PySpark DataFrame.
            keys(List[str]): Columns that identify a business entity.

        Returns:
            Any: Consolidated DataFrame with filled attributes per group.
        """
        framework = self._detect_framework(df)
        if framework == "pandas":
            # GroupBy.first() skips missing values per column by default,
            # which is exactly the compaction wanted here.
            return df.groupby(keys).first().reset_index()

        agg_cols = [
            F.first(c, ignorenulls=True).alias(c) for c in df.columns if c not in keys
        ]
        return df.groupBy(keys).agg(*agg_cols)

    @staticmethod
    def _top_row_per_key_pandas(
        df: Any,
        keys: List[str],
        sort_cols: List[str],
        ascending: Union[bool, List[bool]],
    ) -> Any:
        """Returns the top-ranked whole row of each key group (Pandas only).

        GroupBy.first()/last() are deliberately not used: they pick the first
        or last non-null value column by column and can therefore blend
        several rows, and they drop groups whose key is missing.

        Args:
            df(Any): Input Pandas DataFrame.
            keys(List[str]): Columns that identify a business entity.
            sort_cols(List[str]): Columns that rank rows inside a group.
            ascending(Union[bool, List[bool]]): Sort direction(s) for
                ``sort_cols``.

        Returns:
            Any: One row per key group, key columns first, sorted by the
                keys, with a fresh index.
        """
        ranked = df.sort_values(by=sort_cols, ascending=ascending, kind="stable")
        winners = ranked.drop_duplicates(subset=keys, keep="first")
        trailing = [col for col in winners.columns if col not in keys]
        return (
            winners.sort_values(by=keys, kind="stable")[list(keys) + trailing]
            .reset_index(drop=True)
        )

    @staticmethod
    def _top_row_per_key_spark(df: Any, window_spec: Any) -> Any:
        """Keeps the row ranked first inside each window partition (Spark only).

        Args:
            df(Any): Input PySpark DataFrame.
            window_spec(Any): Partitioned and ordered window specification.

        Returns:
            Any: DataFrame without the temporary ``_row_num`` helper column.
        """
        return (
            df.withColumn("_row_num", F.row_number().over(window_spec))
            .filter(F.col("_row_num") == 1)
            .drop("_row_num")
        )

    def _detect_framework(self, df: Any) -> str:
        """Internal helper to identify the dataframe framework type safely.

        Detection is based on the class name and module path of ``df``.

        Args:
            df(Any): Input dataframe object.

        Returns:
            str: String identifier ("pandas" or "spark").

        Raises:
            ImportError: If a PySpark DataFrame is passed but the PySpark
                helpers could not be imported at module load time.
            TypeError: If ``df`` is neither a Pandas nor a PySpark DataFrame.
        """
        df_class = type(df)
        if df_class.__name__ == "DataFrame":
            module_path = df_class.__module__

            if "pandas" in module_path:
                return "pandas"

            if "pyspark" in module_path:
                if not HAS_SPARK:
                    raise ImportError("PySpark is required but could not be imported.")
                return "spark"

        raise TypeError(
            f"Unsupported type: {type(df)}. "
            "Only Pandas and PySpark DataFrames are supported."
        )