sagaranalysis 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sagaranalysis-0.1.0/LICENSE +21 -0
- sagaranalysis-0.1.0/PKG-INFO +79 -0
- sagaranalysis-0.1.0/README.md +53 -0
- sagaranalysis-0.1.0/pyproject.toml +36 -0
- sagaranalysis-0.1.0/sagaranalysis/__init__.py +125 -0
- sagaranalysis-0.1.0/sagaranalysis/cleaning/__init__.py +22 -0
- sagaranalysis-0.1.0/sagaranalysis/cleaning/base.py +23 -0
- sagaranalysis-0.1.0/sagaranalysis/cleaning/pipeline.py +118 -0
- sagaranalysis-0.1.0/sagaranalysis/cleaning/rules.py +314 -0
- sagaranalysis-0.1.0/sagaranalysis/eda/__init__.py +7 -0
- sagaranalysis-0.1.0/sagaranalysis/eda/analyzer.py +195 -0
- sagaranalysis-0.1.0/sagaranalysis/eda/insights.py +195 -0
- sagaranalysis-0.1.0/sagaranalysis/report/__init__.py +3 -0
- sagaranalysis-0.1.0/sagaranalysis/report/model.py +114 -0
- sagaranalysis-0.1.0/sagaranalysis/report/templates.py +1102 -0
- sagaranalysis-0.1.0/sagaranalysis/report/terminal.py +219 -0
- sagaranalysis-0.1.0/sagaranalysis/visualization/__init__.py +15 -0
- sagaranalysis-0.1.0/sagaranalysis/visualization/plots.py +290 -0
- sagaranalysis-0.1.0/sagaranalysis.egg-info/PKG-INFO +79 -0
- sagaranalysis-0.1.0/sagaranalysis.egg-info/SOURCES.txt +25 -0
- sagaranalysis-0.1.0/sagaranalysis.egg-info/dependency_links.txt +1 -0
- sagaranalysis-0.1.0/sagaranalysis.egg-info/requires.txt +11 -0
- sagaranalysis-0.1.0/sagaranalysis.egg-info/top_level.txt +1 -0
- sagaranalysis-0.1.0/setup.cfg +4 -0
- sagaranalysis-0.1.0/tests/test_cleaning.py +142 -0
- sagaranalysis-0.1.0/tests/test_eda.py +93 -0
- sagaranalysis-0.1.0/tests/test_visualization.py +61 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Sagar
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: sagaranalysis
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A production-ready Python library for one-line data cleaning, EDA, and beautiful HTML reporting.
|
|
5
|
+
Author-email: Sagar <sagarmohite4895@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Classifier: Programming Language :: Python :: 3
|
|
8
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
9
|
+
Classifier: Operating System :: OS Independent
|
|
10
|
+
Classifier: Intended Audience :: Science/Research
|
|
11
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
12
|
+
Requires-Python: >=3.8
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
14
|
+
License-File: LICENSE
|
|
15
|
+
Requires-Dist: pandas>=1.3.0
|
|
16
|
+
Requires-Dist: numpy>=1.20.0
|
|
17
|
+
Requires-Dist: matplotlib>=3.4.0
|
|
18
|
+
Requires-Dist: seaborn>=0.11.0
|
|
19
|
+
Requires-Dist: jinja2>=3.0.0
|
|
20
|
+
Requires-Dist: scipy>=1.7.0
|
|
21
|
+
Provides-Extra: dev
|
|
22
|
+
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
23
|
+
Requires-Dist: black; extra == "dev"
|
|
24
|
+
Requires-Dist: ruff; extra == "dev"
|
|
25
|
+
Dynamic: license-file
|
|
26
|
+
|
|
27
|
+
# SagarAnalysis
|
|
28
|
+
|
|
29
|
+
A production-ready, beginner-friendly Python package designed for one-line data cleaning, one-line Exploratory Data Analysis (EDA), and stunning, glassmorphic HTML reporting.
|
|
30
|
+
|
|
31
|
+
## Features
|
|
32
|
+
|
|
33
|
+
- **One-line Cleaning (`clean(df)`)**:
|
|
34
|
+
- Automatically standardizes column names to `snake_case`.
|
|
35
|
+
- Removes duplicate rows.
|
|
36
|
+
- Detects and drops constant/highly missing columns.
|
|
37
|
+
- Imputes missing values intelligently (median/mean/mode/unknown).
|
|
38
|
+
- Handles numerical outliers via clipping.
|
|
39
|
+
- Coerces data types (string to numeric, string to datetime, Yes/No to boolean).
|
|
40
|
+
- **One-line EDA & Report (`analysis(df)`)**:
|
|
41
|
+
- Generates a comprehensive overview of the dataset.
|
|
42
|
+
- Performs descriptive statistics and missingness analysis.
|
|
43
|
+
- Discovers high correlations and skewness.
|
|
44
|
+
- Auto-detects the target column and runs target profiling.
|
|
45
|
+
- **Stunning HTML Reports (`report.save()`)**:
|
|
46
|
+
- Interactive, modern Glassmorphism design.
|
|
47
|
+
- Supports light/dark theme toggles.
|
|
48
|
+
- Fully responsive layout with embedded Base64 charts.
|
|
49
|
+
- Searchable variables, collapsible details, and clear data insights.
|
|
50
|
+
|
|
51
|
+
## Installation
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
pip install sagaranalysis
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
## Quick Start
|
|
58
|
+
|
|
59
|
+
```python
|
|
60
|
+
import pandas as pd
|
|
61
|
+
import sagaranalysis as sa
|
|
62
|
+
|
|
63
|
+
# Load your dirty dataset
|
|
64
|
+
df = pd.read_csv("dirty_data.csv")
|
|
65
|
+
|
|
66
|
+
# Clean your data in one line
|
|
67
|
+
cleaned_df = sa.clean(df)
|
|
68
|
+
|
|
69
|
+
# Perform automatic EDA and generate insights
|
|
70
|
+
report = sa.analysis(cleaned_df)
|
|
71
|
+
|
|
72
|
+
# View or save the report
|
|
73
|
+
report.show() # Opens in local browser or renders in Jupyter Notebook
|
|
74
|
+
report.save("report.html") # Saves self-contained report to disk
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
## License
|
|
78
|
+
|
|
79
|
+
This project is licensed under the MIT License - see the LICENSE file for details.
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
# SagarAnalysis
|
|
2
|
+
|
|
3
|
+
A production-ready, beginner-friendly Python package designed for one-line data cleaning, one-line Exploratory Data Analysis (EDA), and stunning, glassmorphic HTML reporting.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- **One-line Cleaning (`clean(df)`)**:
|
|
8
|
+
- Automatically standardizes column names to `snake_case`.
|
|
9
|
+
- Removes duplicate rows.
|
|
10
|
+
- Detects and drops constant/highly missing columns.
|
|
11
|
+
- Imputes missing values intelligently (median/mean/mode/unknown).
|
|
12
|
+
- Handles numerical outliers via clipping.
|
|
13
|
+
- Coerces data types (string to numeric, string to datetime, Yes/No to boolean).
|
|
14
|
+
- **One-line EDA & Report (`analysis(df)`)**:
|
|
15
|
+
- Generates a comprehensive overview of the dataset.
|
|
16
|
+
- Performs descriptive statistics and missingness analysis.
|
|
17
|
+
- Discovers high correlations and skewness.
|
|
18
|
+
- Auto-detects the target column and runs target profiling.
|
|
19
|
+
- **Stunning HTML Reports (`report.save()`)**:
|
|
20
|
+
- Interactive, modern Glassmorphism design.
|
|
21
|
+
- Supports light/dark theme toggles.
|
|
22
|
+
- Fully responsive layout with embedded Base64 charts.
|
|
23
|
+
- Searchable variables, collapsible details, and clear data insights.
|
|
24
|
+
|
|
25
|
+
## Installation
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
pip install sagaranalysis
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
## Quick Start
|
|
32
|
+
|
|
33
|
+
```python
|
|
34
|
+
import pandas as pd
|
|
35
|
+
import sagaranalysis as sa
|
|
36
|
+
|
|
37
|
+
# Load your dirty dataset
|
|
38
|
+
df = pd.read_csv("dirty_data.csv")
|
|
39
|
+
|
|
40
|
+
# Clean your data in one line
|
|
41
|
+
cleaned_df = sa.clean(df)
|
|
42
|
+
|
|
43
|
+
# Perform automatic EDA and generate insights
|
|
44
|
+
report = sa.analysis(cleaned_df)
|
|
45
|
+
|
|
46
|
+
# View or save the report
|
|
47
|
+
report.show() # Opens in local browser or renders in Jupyter Notebook
|
|
48
|
+
report.save("report.html") # Saves self-contained report to disk
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
## License
|
|
52
|
+
|
|
53
|
+
This project is licensed under the MIT License - see the LICENSE file for details.
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "sagaranalysis"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
authors = [
|
|
9
|
+
{ name = "Sagar", email = "sagarmohite4895@gmail.com" }
|
|
10
|
+
]
|
|
11
|
+
description = "A production-ready Python library for one-line data cleaning, EDA, and beautiful HTML reporting."
|
|
12
|
+
readme = "README.md"
|
|
13
|
+
requires-python = ">=3.8"
|
|
14
|
+
license = { text = "MIT" }
|
|
15
|
+
classifiers = [
|
|
16
|
+
"Programming Language :: Python :: 3",
|
|
17
|
+
"License :: OSI Approved :: MIT License",
|
|
18
|
+
"Operating System :: OS Independent",
|
|
19
|
+
"Intended Audience :: Science/Research",
|
|
20
|
+
"Topic :: Scientific/Engineering :: Information Analysis",
|
|
21
|
+
]
|
|
22
|
+
dependencies = [
|
|
23
|
+
"pandas>=1.3.0",
|
|
24
|
+
"numpy>=1.20.0",
|
|
25
|
+
"matplotlib>=3.4.0",
|
|
26
|
+
"seaborn>=0.11.0",
|
|
27
|
+
"jinja2>=3.0.0",
|
|
28
|
+
"scipy>=1.7.0"
|
|
29
|
+
]
|
|
30
|
+
|
|
31
|
+
[project.optional-dependencies]
|
|
32
|
+
dev = [
|
|
33
|
+
"pytest>=7.0.0",
|
|
34
|
+
"black",
|
|
35
|
+
"ruff"
|
|
36
|
+
]
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from typing import Optional, Dict
|
|
3
|
+
import pandas as pd
|
|
4
|
+
|
|
5
|
+
# Setup package logging
|
|
6
|
+
logging.getLogger("sagaranalysis").addHandler(logging.NullHandler())
|
|
7
|
+
|
|
8
|
+
from sagaranalysis.cleaning import (
|
|
9
|
+
CleaningPipeline,
|
|
10
|
+
ColumnStandardizer,
|
|
11
|
+
DuplicateRemover,
|
|
12
|
+
ColumnDropper,
|
|
13
|
+
TypeCoercer,
|
|
14
|
+
MissingImputer,
|
|
15
|
+
OutlierHandler,
|
|
16
|
+
CleaningSummary
|
|
17
|
+
)
|
|
18
|
+
from sagaranalysis.eda import DatasetAnalyzer
|
|
19
|
+
from sagaranalysis.report import Report
|
|
20
|
+
from sagaranalysis.visualization import (
|
|
21
|
+
plot_missing_values,
|
|
22
|
+
plot_numerical_distribution,
|
|
23
|
+
plot_categorical_frequency,
|
|
24
|
+
plot_correlation_heatmap,
|
|
25
|
+
plot_target_relationship
|
|
26
|
+
)
|
|
27
|
+
|
|
28
|
+
def clean(
    df: pd.DataFrame,
    missing_threshold: float = 0.60,
    constant_threshold: float = 0.99,
    outlier_multiplier: float = 1.5
) -> pd.DataFrame:
    """
    Run the default cleaning pipeline over a DataFrame in a single call.

    The pipeline is assembled in a fixed order: ColumnStandardizer,
    DuplicateRemover, ColumnDropper, TypeCoercer, MissingImputer and
    finally OutlierHandler.

    Args:
        df: The pandas DataFrame to clean.
        missing_threshold: Percentage of null values above which a column is dropped (default 60%).
            Passed to ColumnDropper as its first argument.
        constant_threshold: Percentage of duplicate values above which a column is dropped (default 99%).
            Passed to ColumnDropper as its second argument.
        outlier_multiplier: IQR multiplier for clipping outliers (default 1.5).
            Passed to OutlierHandler.

    Returns:
        The DataFrame returned by `CleaningPipeline.run`. The pipeline attaches
        its `CleaningSummary` under `attrs['cleaning_summary']`; the summary
        object itself is not returned from this function.
    """
    pipeline = CleaningPipeline()

    # Order matters: each step receives the output of the previous one.
    default_steps = (
        ColumnStandardizer(),
        DuplicateRemover(),
        ColumnDropper(missing_threshold, constant_threshold),
        TypeCoercer(),
        MissingImputer(),
        OutlierHandler(outlier_multiplier),
    )
    for step in default_steps:
        pipeline.add_step(step)

    # Only the frame is handed back; the summary travels on the frame's attrs.
    cleaned_df, _summary = pipeline.run(df)
    return cleaned_df
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class _NullSpinner:
    """Silent stand-in for ``halo.Halo`` exposing only the methods ``analysis`` calls."""

    def start(self):
        return self

    def succeed(self, text=None):
        return self

    def fail(self, text=None):
        return self


def _make_spinner(text: str, color: str):
    """Return a Halo spinner, or a no-op spinner when ``halo`` cannot be imported."""
    try:
        from halo import Halo
    except ImportError:
        # The spinner is purely cosmetic and halo is not among the package's
        # declared dependencies, so its absence must not break the analysis.
        return _NullSpinner()
    return Halo(text=text, spinner="dots", color=color)


def analysis(df: pd.DataFrame, target: Optional[str] = None) -> Report:
    """
    Analyzes a pandas DataFrame in one line and returns an interactive Report object.

    Runs `DatasetAnalyzer` on the frame, renders one plot per numeric or
    categorical column plus the optional missingness, correlation and
    target-relationship plots, and bundles everything into a `Report`.
    A terminal spinner is shown while working when the optional `halo`
    package is installed; without it the function runs silently.

    Args:
        df: The pandas DataFrame to analyze.
        target: Optional target column name. If not specified, attempts to auto-detect.

    Returns:
        A Report object built from the analysis results, the generated plots
        and any `cleaning_summary` found in `df.attrs`.

    Raises:
        Exception: Any error raised during analysis or plotting is re-raised
            unchanged after the spinner has been marked as failed.
    """
    spinner = _make_spinner("Analyzing dataset features...", "magenta")
    spinner.start()

    try:
        # 1. Perform calculations
        analyzer = DatasetAnalyzer(df, target=target)
        results = analyzer.analyze()

        detected_target = results["overview"]["target_detected"]
        detected_target_type = results["target"]["type"] if results["target"] else None

        # 2. Generate visualizations, keyed by plot name
        # NOTE(review): per-column plots share this dict with the fixed keys
        # "missingness" and "correlation"; a column carrying one of those names
        # would collide with them. Confirm how Report consumes these keys.
        plots: Dict[str, str] = {}

        # Missing values plot (skipped when the helper returns a falsy value)
        missing_plot = plot_missing_values(df)
        if missing_plot:
            plots["missingness"] = missing_plot

        # Column distributions
        for col_name, col_meta in results["columns"].items():
            series = df[col_name]
            if col_meta["type"] == "numeric":
                plots[col_name] = plot_numerical_distribution(series, col_name)
            elif col_meta["type"] == "categorical":
                plots[col_name] = plot_categorical_frequency(series, col_name)

        # Correlation heatmap
        corr_heatmap = plot_correlation_heatmap(df)
        if corr_heatmap:
            plots["correlation"] = corr_heatmap

        # Target relationships
        if detected_target and results["target"]:
            relationships = results["target"].get("relationships", {})
            for feat in relationships:
                plots[f"target_{feat}"] = plot_target_relationship(
                    df, feat, detected_target, detected_target_type
                )

        # 3. Retrieve any cleaning summary attached to the DataFrame
        cleaning_summary = df.attrs.get("cleaning_summary")

        # 4. Construct the report before announcing success, so a failure in
        #    Report() is never preceded by a "complete" message.
        report = Report(results, plots, cleaning_summary=cleaning_summary)
        spinner.succeed("Statistical profiling and visual analytics complete!")
        return report
    except Exception as e:
        spinner.fail(f"Analysis failed: {e}")
        # Bare raise keeps the original traceback intact.
        raise
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
from sagaranalysis.cleaning.base import CleaningStep
|
|
2
|
+
from sagaranalysis.cleaning.pipeline import CleaningPipeline, CleaningSummary
|
|
3
|
+
from sagaranalysis.cleaning.rules import (
|
|
4
|
+
ColumnStandardizer,
|
|
5
|
+
DuplicateRemover,
|
|
6
|
+
ColumnDropper,
|
|
7
|
+
TypeCoercer,
|
|
8
|
+
MissingImputer,
|
|
9
|
+
OutlierHandler
|
|
10
|
+
)
|
|
11
|
+
|
|
12
|
+
# Names re-exported by the cleaning subpackage.
__all__ = [
    # Step interface
    "CleaningStep",
    # Orchestration and its result object
    "CleaningPipeline",
    "CleaningSummary",
    # Built-in cleaning rules
    "ColumnStandardizer",
    "DuplicateRemover",
    "ColumnDropper",
    "TypeCoercer",
    "MissingImputer",
    "OutlierHandler",
]
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from typing import Tuple, List
|
|
3
|
+
import pandas as pd
|
|
4
|
+
|
|
5
|
+
class CleaningStep(ABC):
    """
    Interface implemented by every individual data cleaning operation.

    A concrete step supplies `execute`, which takes a DataFrame and hands
    back the resulting DataFrame (a copy or a view) together with a list of
    log messages describing what was changed.
    """

    @abstractmethod
    def execute(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, List[str]]:
        """
        Apply this cleaning operation to `df`.

        Args:
            df: The input pandas DataFrame.

        Returns:
            A `(cleaned_df, logs_list)` tuple, where `logs_list` holds the
            messages detailing the changes that were made.
        """
        ...
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
import time
|
|
2
|
+
import logging
|
|
3
|
+
from typing import List, Tuple
|
|
4
|
+
import pandas as pd
|
|
5
|
+
from sagaranalysis.cleaning.base import CleaningStep
|
|
6
|
+
|
|
7
|
+
logger = logging.getLogger("sagaranalysis.cleaning")
|
|
8
|
+
|
|
9
|
+
class CleaningSummary:
    """
    Record of one cleaning session: shapes before and after, the collected
    log messages, and the elapsed time.
    """

    def __init__(self, initial_shape: Tuple[int, int]):
        self.initial_shape = initial_shape
        # Starts out equal to the initial shape until a caller overwrites it.
        self.final_shape = initial_shape
        self.logs: List[str] = []
        self.execution_time_seconds: float = 0.0

    @property
    def rows_removed(self) -> int:
        """Difference between the initial and final row counts."""
        rows_before = self.initial_shape[0]
        rows_after = self.final_shape[0]
        return rows_before - rows_after

    @property
    def cols_removed(self) -> int:
        """Difference between the initial and final column counts."""
        cols_before = self.initial_shape[1]
        cols_after = self.final_shape[1]
        return cols_before - cols_after

    def add_logs(self, step_logs: List[str]) -> None:
        """Append the messages produced by one step to the running log."""
        self.logs += step_logs

    def report(self) -> str:
        """
        Build a markdown string describing the cleaning session.

        The header lists timing, shapes and removal counts; it is followed by
        one bullet per log message, or a placeholder bullet when there are none.
        """
        header = [
            "# Data Cleaning Summary Report",
            f"- **Execution Time:** {self.execution_time_seconds:.4f} seconds",
            f"- **Initial Shape:** {self.initial_shape[0]} rows, {self.initial_shape[1]} columns",
            f"- **Final Shape:** {self.final_shape[0]} rows, {self.final_shape[1]} columns",
            f"- **Rows Removed:** {self.rows_removed}",
            f"- **Columns Removed:** {self.cols_removed}",
            "\n## Operations Applied:",
        ]

        if not self.logs:
            operations = ["- No operations performed."]
        else:
            operations = [f"- {log}" for log in self.logs]

        return "\n".join(header + operations)

    def __str__(self) -> str:
        return self.report()

    def __repr__(self) -> str:
        log_count = len(self.logs)
        return f"<CleaningSummary initial={self.initial_shape} final={self.final_shape} logs={log_count}>"
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class CleaningPipeline:
    """
    Orchestrates the execution of multiple CleaningSteps.

    Steps run in the order they were added. A step that raises is logged,
    recorded in the summary as an ``ERROR in <step>`` entry and skipped; the
    remaining steps still run on the last good DataFrame. Terminal feedback
    (progress bar, final spinner) is shown only when the optional ``rich``
    and ``halo`` packages can be imported.
    """

    def __init__(self):
        self.steps: List[CleaningStep] = []

    def add_step(self, step: CleaningStep) -> None:
        """
        Appends a cleaning step to the pipeline.
        """
        self.steps.append(step)

    def run(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, CleaningSummary]:
        """
        Runs all cleaning steps in the pipeline sequentially.

        Args:
            df: The input DataFrame. It is copied up front; steps receive the copy.

        Returns:
            A tuple ``(cleaned_df, summary)``. The same summary object is also
            stored under ``cleaned_df.attrs["cleaning_summary"]``.
        """
        # perf_counter is monotonic, so the elapsed time cannot go negative
        # or jump if the system clock is adjusted mid-run.
        start_time = time.perf_counter()
        summary = CleaningSummary(df.shape)
        cleaned_df = df.copy()

        total_steps = len(self.steps)
        failed_steps: List[str] = []

        progress = self._build_progress()
        if progress is None:
            # No progress bar available: just run the steps.
            for step in self.steps:
                cleaned_df = self._apply_step(step, cleaned_df, summary, failed_steps)
        else:
            with progress:
                task = progress.add_task("Executing cleaning pipeline...", total=total_steps)
                for position, step in enumerate(self.steps, start=1):
                    step_name = step.__class__.__name__
                    progress.update(task, description=f"Step {position}/{total_steps}: {step_name}")
                    cleaned_df = self._apply_step(step, cleaned_df, summary, failed_steps)

                    # Small artificial delay so that progress updates are readable to users
                    time.sleep(0.05)
                    progress.advance(task, 1)

        self._announce_outcome(failed_steps)

        summary.final_shape = cleaned_df.shape
        summary.execution_time_seconds = time.perf_counter() - start_time

        # Attach summary to the DataFrame attributes metadata
        cleaned_df.attrs["cleaning_summary"] = summary

        return cleaned_df, summary

    @staticmethod
    def _build_progress():
        """Return a configured rich Progress bar, or None when rich is not importable."""
        try:
            from rich.progress import Progress, SpinnerColumn, BarColumn, TextColumn
        except ImportError:
            return None
        return Progress(
            SpinnerColumn(spinner_name="dots", style="cyan"),
            TextColumn("[cyan]{task.description}"),
            BarColumn(bar_width=30, style="dim cyan", complete_style="cyan"),
            TextColumn("[cyan]{task.percentage:>3.0f}%"),
            transient=True
        )

    @staticmethod
    def _apply_step(step, frame, summary, failed_steps):
        """Run one step; on error log it, record it, and return the frame unchanged."""
        step_name = step.__class__.__name__
        try:
            frame, step_logs = step.execute(frame)
            summary.add_logs(step_logs)
        except Exception as exc:
            # Deliberate best-effort: one failing rule must not abort the whole clean.
            logger.error("Error in cleaning step %s: %s", step_name, exc, exc_info=True)
            summary.add_logs([f"ERROR in {step_name}: {exc}"])
            failed_steps.append(step_name)
        return frame

    @staticmethod
    def _announce_outcome(failed_steps):
        """Show the final status line, flagging steps that raised instead of claiming success."""
        if failed_steps:
            message = f"Dataset cleaned with errors in: {', '.join(failed_steps)}"
        else:
            message = "Dataset cleaned successfully!"

        try:
            from halo import Halo
        except ImportError:
            # Without halo, fall back to the package logger.
            emit = logger.warning if failed_steps else logger.info
            emit("%s", message)
            return

        spinner = Halo(text=message, spinner="dots", color="cyan")
        spinner.start()
        if failed_steps:
            spinner.warn()
        else:
            spinner.succeed()
|