data-engineering-exp 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_engineering_exp-0.1.0/LICENSE +21 -0
- data_engineering_exp-0.1.0/PKG-INFO +107 -0
- data_engineering_exp-0.1.0/README.md +85 -0
- data_engineering_exp-0.1.0/data_engineering_exp/__init__.py +0 -0
- data_engineering_exp-0.1.0/data_engineering_exp/cli.py +63 -0
- data_engineering_exp-0.1.0/data_engineering_exp/core/__init__.py +0 -0
- data_engineering_exp-0.1.0/data_engineering_exp/core/catalog.py +197 -0
- data_engineering_exp-0.1.0/data_engineering_exp/core/deduplication.py +192 -0
- data_engineering_exp-0.1.0/data_engineering_exp/core/initializer.py +93 -0
- data_engineering_exp-0.1.0/data_engineering_exp/core/io.py +134 -0
- data_engineering_exp-0.1.0/data_engineering_exp/pandas_core/__init__.py +0 -0
- data_engineering_exp-0.1.0/data_engineering_exp/pandas_core/scd2.py +225 -0
- data_engineering_exp-0.1.0/data_engineering_exp/spark_core/__init__.py +0 -0
- data_engineering_exp-0.1.0/data_engineering_exp/spark_core/scd2.py +283 -0
- data_engineering_exp-0.1.0/pyproject.toml +94 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 idperez720
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: data-engineering-exp
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A minimalist, agnostic Python framework to standardize data engineering pipelines.
|
|
5
|
+
License: MIT
|
|
6
|
+
Keywords: data-engineering,pyspark,pandas,data-catalog
|
|
7
|
+
Author: idperez720
|
|
8
|
+
Author-email: ivandavidperez4@gmail.com
|
|
9
|
+
Requires-Python: >=3.11,<4.0.0
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Operating System :: OS Independent
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
16
|
+
Requires-Dist: click (>=8.4.1,<9.0.0)
|
|
17
|
+
Requires-Dist: pandas (>=3.0.3,<4.0.0)
|
|
18
|
+
Requires-Dist: pyspark (>=4.1.2,<5.0.0)
|
|
19
|
+
Requires-Dist: pyyaml (>=6.0.3,<7.0.0)
|
|
20
|
+
Project-URL: Repository, https://github.com/idperez720/data-engineering-exp
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
|
|
23
|
+
# Data Engineering Experience 🚀
|
|
24
|
+
|
|
25
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
26
|
+
[Python 3.11+](https://www.python.org)
|
|
27
|
+
|
|
28
|
+
**dex** (*Data Engineering Experience*) is a minimalist, agnostic Python framework designed to streamline and standardize data engineering pipelines. By embracing **Convention over Configuration**, `dex` eliminates environment friction, absolute path hardcoding, and complex PySpark session management.
|
|
29
|
+
|
|
30
|
+
---
|
|
31
|
+
|
|
32
|
+
## ✨ Key Features
|
|
33
|
+
|
|
34
|
+
* **Zero-Config File Discovery:** Automatic tree-walking directory resolution anchors your data catalog using your local `pyproject.toml` file.
|
|
35
|
+
* **Decentralized Catalog:** Declare your metadata layouts inside modular, self-contained mini-YAML files.
|
|
36
|
+
* **Elastic Processing Runtimes:** Switch dynamically between **Pandas** and **PySpark** execution engines using exactly the same unified interface.
|
|
37
|
+
* **Interactive CLI Scaffolding:** Spin up a new production-ready data directory structure instantly with `dex init`.
|
|
38
|
+
|
|
39
|
+
---
|
|
40
|
+
|
|
41
|
+
## 📦 Installation
|
|
42
|
+
|
|
43
|
+
*(Once published to PyPI)*
|
|
44
|
+
```bash
|
|
45
|
+
pip install data-engineering-exp
|
|
46
|
+
```
|
|
47
|
+
Or install it directly from the source repository using Poetry:
|
|
48
|
+
```bash
|
|
49
|
+
poetry add git+https://github.com/idperez720/data-engineering-exp.git
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
---
|
|
53
|
+
|
|
54
|
+
## 🏁 Quick Start
|
|
55
|
+
|
|
56
|
+
### 1. Initialize your workspace
|
|
57
|
+
|
|
58
|
+
Navigate to an empty directory and let the interactive wizard scaffold the workspace conventions:
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
dex init
|
|
62
|
+
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
### 2. Declare a dataset
|
|
66
|
+
|
|
67
|
+
Add a specification block inside `conf/catalog/sample_dataset.yaml`:
|
|
68
|
+
|
|
69
|
+
```yaml
|
|
70
|
+
customers:
|
|
71
|
+
description: "Main production customer data"
|
|
72
|
+
format: "csv"
|
|
73
|
+
engine: "pandas"
|
|
74
|
+
storage_path: "data/sample_table.csv"
|
|
75
|
+
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
### 3. Load data anywhere
|
|
79
|
+
|
|
80
|
+
Create a Python script or open a Jupyter Notebook inside `src/notebooks/` and fetch your data instantly:
|
|
81
|
+
|
|
82
|
+
```python
|
|
83
|
+
from data_engineering_exp.core.io import DataLoader
|
|
84
|
+
|
|
85
|
+
# Autodiscovers your project root boundaries and settings
|
|
86
|
+
loader = DataLoader()
|
|
87
|
+
|
|
88
|
+
# Loads the dataset securely as a Pandas DataFrame
|
|
89
|
+
df = loader.load("customers")
|
|
90
|
+
df.head()
|
|
91
|
+
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
---
|
|
95
|
+
|
|
96
|
+
## 📖 Complete Documentation
|
|
97
|
+
|
|
98
|
+
For comprehensive guides, testing architecture deep-dives, and complete API references, visit our documentation site:
|
|
99
|
+
👉 **[https://github.com/idperez720/data-engineering-exp](https://github.com/idperez720/data-engineering-exp)** *(Project repository; replace with your deployed docs URL, e.g., GitHub Pages, once it is available)*
|
|
100
|
+
|
|
101
|
+
---
|
|
102
|
+
|
|
103
|
+
## ⚖️ License
|
|
104
|
+
|
|
105
|
+
Distributed under the **MIT License**. Any modification or distribution (including forks) must include the original copyright notice and liability waiver. See `LICENSE` for more information.
|
|
106
|
+
|
|
107
|
+
```
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
# Data Engineering Experience 🚀
|
|
2
|
+
|
|
3
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
4
|
+
[Python 3.11+](https://www.python.org)
|
|
5
|
+
|
|
6
|
+
**dex** (*Data Engineering Experience*) is a minimalist, agnostic Python framework designed to streamline and standardize data engineering pipelines. By embracing **Convention over Configuration**, `dex` eliminates environment friction, absolute path hardcoding, and complex PySpark session management.
|
|
7
|
+
|
|
8
|
+
---
|
|
9
|
+
|
|
10
|
+
## ✨ Key Features
|
|
11
|
+
|
|
12
|
+
* **Zero-Config File Discovery:** Automatic tree-walking directory resolution anchors your data catalog using your local `pyproject.toml` file.
|
|
13
|
+
* **Decentralized Catalog:** Declare your metadata layouts inside modular, self-contained mini-YAML files.
|
|
14
|
+
* **Elastic Processing Runtimes:** Switch dynamically between **Pandas** and **PySpark** execution engines using exactly the same unified interface.
|
|
15
|
+
* **Interactive CLI Scaffolding:** Spin up a new production-ready data directory structure instantly with `dex init`.
|
|
16
|
+
|
|
17
|
+
---
|
|
18
|
+
|
|
19
|
+
## 📦 Installation
|
|
20
|
+
|
|
21
|
+
*(Once published to PyPI)*
|
|
22
|
+
```bash
|
|
23
|
+
pip install data-engineering-exp
|
|
24
|
+
```
|
|
25
|
+
Or install it directly from the source repository using Poetry:
|
|
26
|
+
```bash
|
|
27
|
+
poetry add git+https://github.com/idperez720/data-engineering-exp.git
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
---
|
|
31
|
+
|
|
32
|
+
## 🏁 Quick Start
|
|
33
|
+
|
|
34
|
+
### 1. Initialize your workspace
|
|
35
|
+
|
|
36
|
+
Navigate to an empty directory and let the interactive wizard scaffold the workspace conventions:
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
dex init
|
|
40
|
+
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
### 2. Declare a dataset
|
|
44
|
+
|
|
45
|
+
Add a specification block inside `conf/catalog/sample_dataset.yaml`:
|
|
46
|
+
|
|
47
|
+
```yaml
|
|
48
|
+
customers:
|
|
49
|
+
description: "Main production customer data"
|
|
50
|
+
format: "csv"
|
|
51
|
+
engine: "pandas"
|
|
52
|
+
storage_path: "data/sample_table.csv"
|
|
53
|
+
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
### 3. Load data anywhere
|
|
57
|
+
|
|
58
|
+
Create a Python script or open a Jupyter Notebook inside `src/notebooks/` and fetch your data instantly:
|
|
59
|
+
|
|
60
|
+
```python
|
|
61
|
+
from data_engineering_exp.core.io import DataLoader
|
|
62
|
+
|
|
63
|
+
# Autodiscovers your project root boundaries and settings
|
|
64
|
+
loader = DataLoader()
|
|
65
|
+
|
|
66
|
+
# Loads the dataset securely as a Pandas DataFrame
|
|
67
|
+
df = loader.load("customers")
|
|
68
|
+
df.head()
|
|
69
|
+
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
---
|
|
73
|
+
|
|
74
|
+
## 📖 Complete Documentation
|
|
75
|
+
|
|
76
|
+
For comprehensive guides, testing architecture deep-dives, and complete API references, visit our documentation site:
|
|
77
|
+
👉 **[https://github.com/idperez720/data-engineering-exp](https://github.com/idperez720/data-engineering-exp)** *(Project repository; replace with your deployed docs URL, e.g., GitHub Pages, once it is available)*
|
|
78
|
+
|
|
79
|
+
---
|
|
80
|
+
|
|
81
|
+
## ⚖️ License
|
|
82
|
+
|
|
83
|
+
Distributed under the **MIT License**. Any modification or distribution (including forks) must include the original copyright notice and liability waiver. See `LICENSE` for more information.
|
|
84
|
+
|
|
85
|
+
```
|
|
File without changes
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"""Command Line Interface (CLI) entry point for dex using Click."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
|
|
5
|
+
import click
|
|
6
|
+
|
|
7
|
+
from data_engineering_exp.core.initializer import ProjectInitializer
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@click.group()
def entry_point() -> None:
    """dex - Data Engineering Experience CLI utilities.

    Args:

    Returns:
        None: Main CLI entry group.
    """
    # The docstring above is kept verbatim: click uses a command's docstring
    # as its help text, so rewording it would change the CLI output.
    # The group does no work itself; sub-commands register through
    # ``@entry_point.command()``, so the docstring is the whole body.
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@entry_point.command()
@click.option(
    "--path",
    type=click.Path(file_okay=False, dir_okay=True),
    default=".",
    help="Root directory where scaffolding will be built. Defaults to '.'",
)
def init(path: str) -> None:
    """Scaffold a new data engineering project structure interactively.

    Args:
        path(str): Target directory string path for project structure.

    Returns:
        None: Side-effect creating folders and configurations.
    """
    # The docstring above is kept verbatim: click uses it as the help text.
    click.echo("🚀 Welcome to the dex project initialization wizard!\n")

    # Suggest the target folder's own name as the project name. The absolute
    # path of a filesystem root has an empty basename, hence the fallback.
    folder_name = os.path.basename(os.path.abspath(path))
    suggested_name = "my-dex-project" if folder_name in ("", ".") else folder_name

    # Dict literals evaluate in order, so the prompts appear as
    # name -> version -> description -> author.
    answers = {
        "name": click.prompt("Project name", default=suggested_name, type=str),
        "version": click.prompt("Version", default="0.1.0", type=str),
        "description": click.prompt(
            "Description",
            default="Data engineering project scaffolded by dex",
            type=str,
        ),
        "author": click.prompt("Author", default="Anonymous", type=str),
    }

    # The initializer is only built once every answer has been collected.
    ProjectInitializer(base_path=path).init_project(**answers)

    click.echo(f"\n✨ Project successfully initialized at '{path}'!")
|
|
File without changes
|
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
"""This module implements decentralized data catalog engines for dex."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
from typing import Any, Dict, List, Optional, cast
|
|
5
|
+
|
|
6
|
+
import yaml
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class DataCatalog:
    """Parses and consolidates declarative configurations from files or directories.

    A catalog source is either a single YAML file or a directory that is
    walked recursively for ``.yml`` / ``.yaml`` files. Every file is expected
    to hold a top-level mapping of ``dataset name -> dataset specification``;
    all mappings are merged into one lookup table.
    """

    # File suffixes recognised as catalog definitions (case-sensitive).
    _YAML_SUFFIXES = (".yml", ".yaml")

    def __init__(self, catalog_path: Optional[str] = None) -> None:
        """Initializes the DataCatalog by scanning files or configuration paths.

        If no path is provided, parent directories of the current working
        directory are searched for ``pyproject.toml`` and the catalog is
        read from ``<that directory>/conf/catalog``.

        Args:
            catalog_path(Optional[str]): Directory or file holding catalog
                definitions. Defaults to None (auto-discovery).

        Raises:
            FileNotFoundError: If the resolved catalog path does not exist,
                or if auto-discovery finds no ``pyproject.toml``.
        """
        self.project_root: str = ""

        if catalog_path is None:
            # Discovery also records ``self.project_root`` as a side effect.
            catalog_path = self._discover_catalog_path()
        else:
            # An explicit path is assumed to follow the standard layout
            # ``<root>/conf/catalog[/<file>]``: the root is three levels
            # above a file and two levels above a directory.
            abs_path = os.path.abspath(catalog_path)
            levels_up = 3 if os.path.isfile(abs_path) else 2
            root = abs_path
            for _ in range(levels_up):
                root = os.path.dirname(root)
            self.project_root = root

        if not os.path.exists(catalog_path):
            raise FileNotFoundError(f"Catalog source path not found at: {catalog_path}")

        self._datasets: Dict[str, Dict[str, Any]] = {}
        self._load_catalog_sources(catalog_path)

    @property
    def dataset_names(self) -> List[str]:
        """Exposes the list of all dataset identifiers present in the catalog.

        Returns:
            List[str]: Dataset names in the order they were loaded.
        """
        return list(self._datasets)

    def get_dataset_metadata(self, dataset_name: str) -> Dict[str, Any]:
        """Retrieves the specification of a dataset without its column list.

        Args:
            dataset_name(str): Unique identifier string of the dataset.

        Returns:
            Dict[str, Any]: Shallow copy of the dataset specification with the
                ``columns`` entry removed. The stored specification is not
                modified.

        Raises:
            KeyError: If the target dataset name is missing from the catalog.
        """
        meta = self._require_dataset(dataset_name).copy()
        meta.pop("columns", None)
        return meta

    def get_column_names(self, dataset_name: str) -> List[str]:
        """Extracts the expected list of column names for a target dataset.

        Args:
            dataset_name(str): Unique identifier string of the dataset.

        Returns:
            List[str]: The ``name`` of each entry under ``columns``, in
                declaration order. Empty when ``columns`` is absent or null.

        Raises:
            KeyError: If the target dataset name is missing from the catalog,
                or if a column entry has no ``name`` key.
        """
        # ``columns:`` with no value parses to None, which ``.get`` would
        # return in place of the default. Treat it the same as a missing key.
        columns_spec = self._require_dataset(dataset_name).get("columns") or []
        return [col["name"] for col in columns_spec]

    def validate_schema_presence(self, df: Any, dataset_name: str) -> bool:
        """Validates that all columns defined in the catalog exist in the dataframe.

        Extra columns in the dataframe are ignored; only missing ones fail.

        Args:
            df(Any): Input Pandas or PySpark DataFrame object.
            dataset_name(str): Catalog identifier string of the dataset target.

        Returns:
            bool: Always True when validation passes.

        Raises:
            KeyError: If the dataset name is missing from the catalog.
            TypeError: If the input dataframe structure is not supported.
            ValueError: If a column mismatch is discovered against the catalog.
        """
        expected_cols = set(self.get_column_names(dataset_name))

        # The engine is detected by class and module name so that neither
        # pandas nor pyspark has to be imported by this module.
        df_class = type(df)
        is_dataframe = df_class.__name__ == "DataFrame"
        if is_dataframe and "pandas" in df_class.__module__:
            actual_cols = set(df.columns.tolist())
        elif is_dataframe and "pyspark" in df_class.__module__:
            actual_cols = set(df.columns)
        else:
            raise TypeError("Unsupported DataFrame type for validation.")

        missing_cols = expected_cols - actual_cols
        if missing_cols:
            # Sorted so the message is stable between runs (set order is not).
            raise ValueError(
                f"Schema mismatch for '{dataset_name}'. "
                f"Missing expected catalog columns: {sorted(missing_cols, key=str)}"
            )

        return True

    def _require_dataset(self, dataset_name: str) -> Dict[str, Any]:
        """Returns the stored specification of a dataset or raises KeyError.

        Args:
            dataset_name(str): Unique identifier string of the dataset.

        Returns:
            Dict[str, Any]: The specification object held by the catalog
                (not a copy).

        Raises:
            KeyError: If the target dataset name is missing from the catalog.
        """
        if dataset_name not in self._datasets:
            raise KeyError(f"Dataset '{dataset_name}' missing from catalog.")
        return self._datasets[dataset_name]

    def _discover_catalog_path(self) -> str:
        """Traverses parent directories upwards to locate the pyproject.toml file.

        The search starts at the current working directory. The first
        directory containing ``pyproject.toml`` is stored in
        ``self.project_root``.

        Returns:
            str: ``<project root>/conf/catalog``. The path is not checked for
                existence here.

        Raises:
            FileNotFoundError: If the pyproject.toml configuration cannot be found.
        """
        current_dir = os.getcwd()

        while True:
            if os.path.exists(os.path.join(current_dir, "pyproject.toml")):
                self.project_root = current_dir
                return os.path.join(current_dir, "conf", "catalog")

            parent_dir = os.path.dirname(current_dir)
            if parent_dir == current_dir:
                # dirname() of the filesystem root is the root itself.
                break
            current_dir = parent_dir

        raise FileNotFoundError(
            "Configuration file (pyproject.toml) could not be located. "
            "Please run 'dex init' to establish a valid project layout."
        )

    def _load_catalog_sources(self, path: str) -> None:
        """Internal worker to process and parse path targets recursively.

        A file path is parsed only if it has a YAML suffix. A directory is
        walked recursively, visiting sub-directories and files in sorted
        order so that the merge result does not depend on filesystem order.

        Args:
            path(str): Catalog file path or catalog directory.

        Returns:
            None: Populates the internal dataset dictionary.
        """
        if os.path.isfile(path):
            if path.endswith(self._YAML_SUFFIXES):
                self._parse_file(path)
            return

        for root, dirs, files in os.walk(path):
            # Sorting ``dirs`` in place fixes the order os.walk descends in.
            dirs.sort()
            for file_name in sorted(files):
                if file_name.endswith(self._YAML_SUFFIXES):
                    self._parse_file(os.path.join(root, file_name))

    def _parse_file(self, file_path: str) -> None:
        """Parses an individual YAML catalog file and merges it into the catalog.

        Files that are empty or whose top level is not a mapping are ignored.
        A dataset name that was already loaded is overwritten by the newer
        definition.

        Args:
            file_path(str): Location of the YAML file to read.

        Returns:
            None: Updates the internal dataset dictionary.
        """
        with open(file_path, "r", encoding="utf-8") as stream:
            content = yaml.safe_load(stream)

        if content and isinstance(content, dict):
            typed_content = cast(Dict[str, Dict[str, Any]], content)
            self._datasets.update(typed_content)
|
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
"""This module implements versatile data deduplication utilities for dex."""
|
|
2
|
+
|
|
3
|
+
from typing import Any, List, Optional, Union
|
|
4
|
+
|
|
5
|
+
# Handle optional dependencies at the top of the file
|
|
6
|
+
try:
|
|
7
|
+
import pyspark.sql.functions as F
|
|
8
|
+
from pyspark.sql import Window
|
|
9
|
+
|
|
10
|
+
HAS_SPARK = True
|
|
11
|
+
except ImportError:
|
|
12
|
+
Window: Any = None
|
|
13
|
+
F: Any = None
|
|
14
|
+
HAS_SPARK = False
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class Deduplicator:
    """Utility class providing multiple strategies for data deduplication.

    Every public method accepts either a Pandas or a PySpark DataFrame and
    dispatches on the detected engine. ``latest``, ``first`` and ``by_order``
    return one complete input row per key group; ``combined`` merges the
    rows of a group column by column; ``distinct`` drops repeated rows.
    """

    def __init__(self) -> None:
        """Initializes the Deduplicator instance (it holds no state)."""

    def latest(self, df: Any, keys: List[str], order_by_col: str) -> Any:
        """Extracts the most recent record per business key group.

        Args:
            df(Any): Input Pandas or PySpark DataFrame.
            keys(List[str]): Columns that identify a business entity.
            order_by_col(str): Column used to determine chronological order.

        Returns:
            Any: DataFrame holding, per key group, the row with the greatest
                ``order_by_col`` value. In the pandas path, rows whose
                ordering value is null are chosen only when the group has no
                other row.

        Raises:
            TypeError: If ``df`` is neither a Pandas nor a PySpark DataFrame.
        """
        if self._detect_framework(df) == "pandas":
            # Descending sort + "first row per key" returns a whole record.
            # GroupBy.last() would instead take the last non-null value of
            # each column separately and could mix several rows together.
            return self._pick_top_rows_pandas(df, keys, order_by_col, False)

        window_spec = Window.partitionBy(keys).orderBy(F.col(order_by_col).desc())
        return self._top_row_per_window(df, window_spec)

    def first(self, df: Any, keys: List[str], order_by_col: str) -> Any:
        """Extracts the earliest chronological record per business key group.

        Args:
            df(Any): Input Pandas or PySpark DataFrame.
            keys(List[str]): Columns that identify a business entity.
            order_by_col(str): Column used to determine chronological order.

        Returns:
            Any: DataFrame holding, per key group, the row that ranks first
                when sorting ascending by ``order_by_col``.

        Raises:
            TypeError: If ``df`` is neither a Pandas nor a PySpark DataFrame.
        """
        if self._detect_framework(df) == "pandas":
            return self._pick_top_rows_pandas(df, keys, order_by_col, True)

        window_spec = Window.partitionBy(keys).orderBy(F.col(order_by_col).asc())
        return self._top_row_per_window(df, window_spec)

    def distinct(self, df: Any, subset: Optional[List[str]] = None) -> Any:
        """Removes strict duplicate rows from the DataFrame.

        Args:
            df(Any): Input Pandas or PySpark DataFrame.
            subset(Optional[List[str]]): Specific columns to consider when
                identifying duplicates. Defaults to None (all columns).

        Returns:
            Any: DataFrame with duplicates removed. The pandas result has a
                fresh 0..n-1 index.

        Raises:
            TypeError: If ``df`` is neither a Pandas nor a PySpark DataFrame.
        """
        if self._detect_framework(df) == "pandas":
            return df.drop_duplicates(subset=subset).reset_index(drop=True)

        # The column list is passed positionally (and omitted when None) so
        # the call does not rely on a ``subset=`` keyword being accepted.
        # NOTE(review): confirm against the pinned PySpark release.
        if subset is None:
            return df.dropDuplicates()
        return df.dropDuplicates(subset)

    def by_order(
        self,
        df: Any,
        keys: List[str],
        order_by_cols: List[str],
        ascending: Union[bool, List[bool]] = True,
    ) -> Any:
        """Deduplicates rows by sorting through multiple custom column criteria.

        Args:
            df(Any): Input Pandas or PySpark DataFrame.
            keys(List[str]): Columns that identify a business entity.
            order_by_cols(List[str]): Ordered list of columns to sort by.
            ascending(Union[bool, List[bool]]): One direction for all sort
                columns, or one direction per sort column. Defaults to True.

        Returns:
            Any: DataFrame holding, per key group, the row ranked first by
                the requested ordering.

        Raises:
            TypeError: If ``df`` is neither a Pandas nor a PySpark DataFrame.
            ValueError: If ``ascending`` is a list whose length differs from
                ``order_by_cols``.
        """
        if self._detect_framework(df) == "pandas":
            return self._pick_top_rows_pandas(df, keys, order_by_cols, ascending)

        if isinstance(ascending, bool):
            directions = [ascending] * len(order_by_cols)
        else:
            directions = list(ascending)

        # zip() would silently drop the unmatched sort columns; pandas
        # rejects the same mismatch, so fail here as well.
        if len(directions) != len(order_by_cols):
            raise ValueError(
                f"Length of ascending ({len(directions)}) "
                f"!= length of order_by_cols ({len(order_by_cols)})"
            )

        order_exprs = [
            F.col(name).asc() if is_asc else F.col(name).desc()
            for name, is_asc in zip(order_by_cols, directions)
        ]
        window_spec = Window.partitionBy(keys).orderBy(*order_exprs)
        return self._top_row_per_window(df, window_spec)

    def combined(self, df: Any, keys: List[str]) -> Any:
        """Stitches rows together by compacting nulls into a golden record.

        Unlike the other strategies, the result row may combine values that
        come from different input rows: each non-key column takes the first
        non-null value found in the group.

        Args:
            df(Any): Input Pandas or PySpark DataFrame.
            keys(List[str]): Columns that identify a business entity.

        Returns:
            Any: One consolidated row per key group.

        Raises:
            TypeError: If ``df`` is neither a Pandas nor a PySpark DataFrame.
        """
        if self._detect_framework(df) == "pandas":
            # GroupBy.first() skips nulls per column, which is the intended
            # "fill the gaps" behaviour here.
            return df.groupby(keys).first().reset_index()

        agg_cols = [
            F.first(c, ignorenulls=True).alias(c) for c in df.columns if c not in keys
        ]
        return df.groupBy(keys).agg(*agg_cols)

    @staticmethod
    def _pick_top_rows_pandas(
        df: Any,
        keys: Union[str, List[str]],
        sort_cols: Union[str, List[str]],
        ascending: Union[bool, List[bool]],
    ) -> Any:
        """Returns the first complete row of each key group after sorting.

        The output keeps the layout the groupby-based implementation
        produced: rows with a null key are dropped, key columns come first,
        rows are ordered by key, and the index is reset.

        Args:
            df(Any): Input Pandas DataFrame.
            keys(Union[str, List[str]]): Key column name(s).
            sort_cols(Union[str, List[str]]): Column name(s) to rank by.
            ascending(Union[bool, List[bool]]): Sort direction(s).

        Returns:
            Any: Pandas DataFrame with one row per key group.
        """
        key_cols = [keys] if isinstance(keys, str) else list(keys)
        ranked = df.sort_values(by=sort_cols, ascending=ascending, na_position="last")
        winners = (
            ranked.dropna(subset=key_cols)
            .drop_duplicates(subset=key_cols, keep="first")
            .sort_values(by=key_cols)
        )
        other_cols = [c for c in winners.columns if c not in key_cols]
        return winners[key_cols + other_cols].reset_index(drop=True)

    @staticmethod
    def _top_row_per_window(df: Any, window_spec: Any) -> Any:
        """Keeps the row numbered 1 within each window partition (PySpark).

        Args:
            df(Any): Input PySpark DataFrame.
            window_spec(Any): Partitioned and ordered window specification.

        Returns:
            Any: PySpark DataFrame without the temporary ``_row_num`` column.
        """
        return (
            df.withColumn("_row_num", F.row_number().over(window_spec))
            .filter(F.col("_row_num") == 1)
            .drop("_row_num")
        )

    def _detect_framework(self, df: Any) -> str:
        """Internal helper to identify the dataframe framework type safely.

        Detection relies on the class name and module path of ``df`` so that
        neither library has to be imported to perform the check.

        Args:
            df(Any): Input dataframe object.

        Returns:
            str: String identifier ("pandas" or "spark").

        Raises:
            ImportError: If ``df`` is a PySpark DataFrame but the PySpark
                helpers could not be imported by this module.
            TypeError: If ``df`` is neither a Pandas nor a PySpark DataFrame.
        """
        df_type = type(df).__name__
        module_type = type(df).__module__

        if "pandas" in module_type and df_type == "DataFrame":
            return "pandas"

        if "pyspark" in module_type and df_type == "DataFrame":
            if not HAS_SPARK:
                raise ImportError("PySpark is required but could not be imported.")
            return "spark"

        raise TypeError(
            f"Unsupported type: {type(df)}. "
            "Only Pandas and PySpark DataFrames are supported."
        )
|