sagaranalysis 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27) hide show
  1. sagaranalysis-0.1.0/LICENSE +21 -0
  2. sagaranalysis-0.1.0/PKG-INFO +79 -0
  3. sagaranalysis-0.1.0/README.md +53 -0
  4. sagaranalysis-0.1.0/pyproject.toml +36 -0
  5. sagaranalysis-0.1.0/sagaranalysis/__init__.py +125 -0
  6. sagaranalysis-0.1.0/sagaranalysis/cleaning/__init__.py +22 -0
  7. sagaranalysis-0.1.0/sagaranalysis/cleaning/base.py +23 -0
  8. sagaranalysis-0.1.0/sagaranalysis/cleaning/pipeline.py +118 -0
  9. sagaranalysis-0.1.0/sagaranalysis/cleaning/rules.py +314 -0
  10. sagaranalysis-0.1.0/sagaranalysis/eda/__init__.py +7 -0
  11. sagaranalysis-0.1.0/sagaranalysis/eda/analyzer.py +195 -0
  12. sagaranalysis-0.1.0/sagaranalysis/eda/insights.py +195 -0
  13. sagaranalysis-0.1.0/sagaranalysis/report/__init__.py +3 -0
  14. sagaranalysis-0.1.0/sagaranalysis/report/model.py +114 -0
  15. sagaranalysis-0.1.0/sagaranalysis/report/templates.py +1102 -0
  16. sagaranalysis-0.1.0/sagaranalysis/report/terminal.py +219 -0
  17. sagaranalysis-0.1.0/sagaranalysis/visualization/__init__.py +15 -0
  18. sagaranalysis-0.1.0/sagaranalysis/visualization/plots.py +290 -0
  19. sagaranalysis-0.1.0/sagaranalysis.egg-info/PKG-INFO +79 -0
  20. sagaranalysis-0.1.0/sagaranalysis.egg-info/SOURCES.txt +25 -0
  21. sagaranalysis-0.1.0/sagaranalysis.egg-info/dependency_links.txt +1 -0
  22. sagaranalysis-0.1.0/sagaranalysis.egg-info/requires.txt +11 -0
  23. sagaranalysis-0.1.0/sagaranalysis.egg-info/top_level.txt +1 -0
  24. sagaranalysis-0.1.0/setup.cfg +4 -0
  25. sagaranalysis-0.1.0/tests/test_cleaning.py +142 -0
  26. sagaranalysis-0.1.0/tests/test_eda.py +93 -0
  27. sagaranalysis-0.1.0/tests/test_visualization.py +61 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Sagar
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,79 @@
1
+ Metadata-Version: 2.4
2
+ Name: sagaranalysis
3
+ Version: 0.1.0
4
+ Summary: A production-ready Python library for one-line data cleaning, EDA, and beautiful HTML reporting.
5
+ Author-email: Sagar <sagarmohite4895@gmail.com>
6
+ License: MIT
7
+ Classifier: Programming Language :: Python :: 3
8
+ Classifier: License :: OSI Approved :: MIT License
9
+ Classifier: Operating System :: OS Independent
10
+ Classifier: Intended Audience :: Science/Research
11
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
12
+ Requires-Python: >=3.8
13
+ Description-Content-Type: text/markdown
14
+ License-File: LICENSE
15
+ Requires-Dist: pandas>=1.3.0
16
+ Requires-Dist: numpy>=1.20.0
17
+ Requires-Dist: matplotlib>=3.4.0
18
+ Requires-Dist: seaborn>=0.11.0
19
+ Requires-Dist: jinja2>=3.0.0
20
+ Requires-Dist: scipy>=1.7.0
21
+ Provides-Extra: dev
22
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
23
+ Requires-Dist: black; extra == "dev"
24
+ Requires-Dist: ruff; extra == "dev"
25
+ Dynamic: license-file
26
+
27
+ # SagarAnalysis
28
+
29
+ A production-ready, beginner-friendly Python package designed for one-line data cleaning, one-line Exploratory Data Analysis (EDA), and stunning, glassmorphic HTML reporting.
30
+
31
+ ## Features
32
+
33
+ - **One-line Cleaning (`clean(df)`)**:
34
+ - Automatically standardizes column names to `snake_case`.
35
+ - Removes duplicate rows.
36
+ - Detects and drops constant or mostly-missing columns.
37
+ - Imputes missing values intelligently (median/mean/mode/unknown).
38
+ - Handles numerical outliers via clipping.
39
+ - Coerces data types (string to numeric, string to datetime, Yes/No to boolean).
40
+ - **One-line EDA & Report (`analysis(df)`)**:
41
+ - Generates a comprehensive overview of the dataset.
42
+ - Performs descriptive statistics and missingness analysis.
43
+ - Discovers high correlations and skewness.
44
+ - Auto-detects the target column and runs target profiling.
45
+ - **Stunning HTML Reports (`report.save()`)**:
46
+ - Interactive, modern Glassmorphism design.
47
+ - Supports light/dark theme toggles.
48
+ - Fully responsive layout with embedded Base64 charts.
49
+ - Searchable variables, collapsible details, and clear data insights.
50
+
51
+ ## Installation
52
+
53
+ ```bash
54
+ pip install sagaranalysis
55
+ ```
56
+
57
+ ## Quick Start
58
+
59
+ ```python
60
+ import pandas as pd
61
+ import sagaranalysis as sa
62
+
63
+ # Load your dirty dataset
64
+ df = pd.read_csv("dirty_data.csv")
65
+
66
+ # Clean your data in one line
67
+ cleaned_df = sa.clean(df)
68
+
69
+ # Perform automatic EDA and generate insights
70
+ report = sa.analysis(cleaned_df)
71
+
72
+ # View or save the report
73
+ report.show() # Opens in local browser or renders in Jupyter Notebook
74
+ report.save("report.html") # Saves self-contained report to disk
75
+ ```
76
+
77
+ ## License
78
+
79
+ This project is licensed under the MIT License - see the LICENSE file for details.
@@ -0,0 +1,53 @@
1
+ # SagarAnalysis
2
+
3
+ A production-ready, beginner-friendly Python package designed for one-line data cleaning, one-line Exploratory Data Analysis (EDA), and stunning, glassmorphic HTML reporting.
4
+
5
+ ## Features
6
+
7
+ - **One-line Cleaning (`clean(df)`)**:
8
+ - Automatically standardizes column names to `snake_case`.
9
+ - Removes duplicate rows.
10
+ - Detects and drops constant or mostly-missing columns.
11
+ - Imputes missing values intelligently (median/mean/mode/unknown).
12
+ - Handles numerical outliers via clipping.
13
+ - Coerces data types (string to numeric, string to datetime, Yes/No to boolean).
14
+ - **One-line EDA & Report (`analysis(df)`)**:
15
+ - Generates a comprehensive overview of the dataset.
16
+ - Performs descriptive statistics and missingness analysis.
17
+ - Discovers high correlations and skewness.
18
+ - Auto-detects the target column and runs target profiling.
19
+ - **Stunning HTML Reports (`report.save()`)**:
20
+ - Interactive, modern Glassmorphism design.
21
+ - Supports light/dark theme toggles.
22
+ - Fully responsive layout with embedded Base64 charts.
23
+ - Searchable variables, collapsible details, and clear data insights.
24
+
25
+ ## Installation
26
+
27
+ ```bash
28
+ pip install sagaranalysis
29
+ ```
30
+
31
+ ## Quick Start
32
+
33
+ ```python
34
+ import pandas as pd
35
+ import sagaranalysis as sa
36
+
37
+ # Load your dirty dataset
38
+ df = pd.read_csv("dirty_data.csv")
39
+
40
+ # Clean your data in one line
41
+ cleaned_df = sa.clean(df)
42
+
43
+ # Perform automatic EDA and generate insights
44
+ report = sa.analysis(cleaned_df)
45
+
46
+ # View or save the report
47
+ report.show() # Opens in local browser or renders in Jupyter Notebook
48
+ report.save("report.html") # Saves self-contained report to disk
49
+ ```
50
+
51
+ ## License
52
+
53
+ This project is licensed under the MIT License - see the LICENSE file for details.
@@ -0,0 +1,36 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "sagaranalysis"
7
+ version = "0.1.0"
8
+ authors = [
9
+ { name = "Sagar", email = "sagarmohite4895@gmail.com" }
10
+ ]
11
+ description = "A production-ready Python library for one-line data cleaning, EDA, and beautiful HTML reporting."
12
+ readme = "README.md"
13
+ requires-python = ">=3.8"
14
+ license = { text = "MIT" }
15
+ classifiers = [
16
+ "Programming Language :: Python :: 3",
17
+ "License :: OSI Approved :: MIT License",
18
+ "Operating System :: OS Independent",
19
+ "Intended Audience :: Science/Research",
20
+ "Topic :: Scientific/Engineering :: Information Analysis",
21
+ ]
22
+ dependencies = [
23
+ "pandas>=1.3.0",
24
+ "numpy>=1.20.0",
25
+ "matplotlib>=3.4.0",
26
+ "seaborn>=0.11.0",
27
+ "jinja2>=3.0.0",
28
+ "scipy>=1.7.0"
29
+ ]
30
+
31
+ [project.optional-dependencies]
32
+ dev = [
33
+ "pytest>=7.0.0",
34
+ "black",
35
+ "ruff"
36
+ ]
@@ -0,0 +1,125 @@
1
+ import logging
2
+ from typing import Optional, Dict
3
+ import pandas as pd
4
+
5
+ # Setup package logging
6
+ logging.getLogger("sagaranalysis").addHandler(logging.NullHandler())
7
+
8
+ from sagaranalysis.cleaning import (
9
+ CleaningPipeline,
10
+ ColumnStandardizer,
11
+ DuplicateRemover,
12
+ ColumnDropper,
13
+ TypeCoercer,
14
+ MissingImputer,
15
+ OutlierHandler,
16
+ CleaningSummary
17
+ )
18
+ from sagaranalysis.eda import DatasetAnalyzer
19
+ from sagaranalysis.report import Report
20
+ from sagaranalysis.visualization import (
21
+ plot_missing_values,
22
+ plot_numerical_distribution,
23
+ plot_categorical_frequency,
24
+ plot_correlation_heatmap,
25
+ plot_target_relationship
26
+ )
27
+
28
def clean(
    df: pd.DataFrame,
    missing_threshold: float = 0.60,
    constant_threshold: float = 0.99,
    outlier_multiplier: float = 1.5
) -> pd.DataFrame:
    """
    Run the default cleaning pipeline over a DataFrame and return the result.

    The steps are applied in a fixed order: ColumnStandardizer, DuplicateRemover,
    ColumnDropper, TypeCoercer, MissingImputer, OutlierHandler.

    Args:
        df: The pandas DataFrame to clean.
        missing_threshold: Share of null values above which a column is dropped,
            given as a fraction (default 0.60, i.e. 60%). Passed to ColumnDropper.
        constant_threshold: Share of duplicate values above which a column is
            dropped, given as a fraction (default 0.99, i.e. 99%). Passed to
            ColumnDropper.
        outlier_multiplier: IQR multiplier for clipping outliers (default 1.5).
            Passed to OutlierHandler.

    Returns:
        The DataFrame produced by `CleaningPipeline.run`. The summary object that
        `run` returns alongside it is not returned from here.
    """
    default_steps = (
        ColumnStandardizer(),
        DuplicateRemover(),
        ColumnDropper(missing_threshold, constant_threshold),
        TypeCoercer(),
        MissingImputer(),
        OutlierHandler(outlier_multiplier),
    )

    pipeline = CleaningPipeline()
    for step in default_steps:
        pipeline.add_step(step)

    # run() yields (DataFrame, summary); only the DataFrame is handed back.
    result_df, _ = pipeline.run(df)
    return result_df
60
+
61
+
62
def analysis(df: pd.DataFrame, target: Optional[str] = None) -> Report:
    """
    Analyzes a pandas DataFrame in one line and returns a Report object.

    Runs DatasetAnalyzer over the frame, renders one plot per numeric or
    categorical column plus the optional missingness, correlation and
    target-relationship plots, and bundles everything into a Report.

    A terminal spinner is shown while this runs when the optional `halo`
    package is importable; without it the analysis runs silently.

    Args:
        df: The pandas DataFrame to analyze.
        target: Optional target column name, forwarded to DatasetAnalyzer.
            The target actually used for plotting is read back from the
            analyzer results (`results["overview"]["target_detected"]`).

    Returns:
        A Report built from the analysis results, the generated plots, and
        whatever is stored under `df.attrs["cleaning_summary"]` (None if absent).

    Raises:
        Exception: Any error raised during analysis or plotting is re-raised
            unchanged after the spinner (if any) has been marked as failed.
    """
    # halo is not among the package's declared install requirements, so it is
    # treated as an optional nicety rather than a hard import.
    try:
        from halo import Halo
    except ImportError:
        spinner = None
    else:
        spinner = Halo(text="Analyzing dataset features...", spinner="dots", color="magenta")
        spinner.start()

    try:
        # 1. Perform calculations
        analyzer = DatasetAnalyzer(df, target=target)
        results = analyzer.analyze()

        detected_target = results["overview"]["target_detected"]
        detected_target_type = results["target"]["type"] if results["target"] else None

        # 2. Generate Base64 Visualizations
        plots: Dict[str, str] = {}

        # Missing values plot (only stored when the helper returns something truthy)
        missing_plot = plot_missing_values(df)
        if missing_plot:
            plots["missingness"] = missing_plot

        # Column distributions; column types other than numeric/categorical get no plot
        for col_name, col_meta in results["columns"].items():
            series = df[col_name]
            if col_meta["type"] == "numeric":
                plots[col_name] = plot_numerical_distribution(series, col_name)
            elif col_meta["type"] == "categorical":
                plots[col_name] = plot_categorical_frequency(series, col_name)

        # Correlation heatmap
        corr_heatmap = plot_correlation_heatmap(df)
        if corr_heatmap:
            plots["correlation"] = corr_heatmap

        # Target relationships
        if detected_target and results["target"]:
            relationships = results["target"].get("relationships", {})
            for feat in relationships:
                plots[f"target_{feat}"] = plot_target_relationship(
                    df, feat, detected_target, detected_target_type
                )

        # 3. Retrieve any cleaning summary attached to the DataFrame
        cleaning_summary = df.attrs.get("cleaning_summary")

        if spinner is not None:
            spinner.succeed("Statistical profiling and visual analytics complete!")
        # 4. Construct Report
        return Report(results, plots, cleaning_summary=cleaning_summary)
    except Exception as e:
        if spinner is not None:
            spinner.fail(f"Analysis failed: {str(e)}")
        # Bare raise keeps the original traceback intact.
        raise
@@ -0,0 +1,22 @@
1
+ from sagaranalysis.cleaning.base import CleaningStep
2
+ from sagaranalysis.cleaning.pipeline import CleaningPipeline, CleaningSummary
3
+ from sagaranalysis.cleaning.rules import (
4
+ ColumnStandardizer,
5
+ DuplicateRemover,
6
+ ColumnDropper,
7
+ TypeCoercer,
8
+ MissingImputer,
9
+ OutlierHandler
10
+ )
11
+
12
+ __all__ = [
13
+ "CleaningStep",
14
+ "CleaningPipeline",
15
+ "CleaningSummary",
16
+ "ColumnStandardizer",
17
+ "DuplicateRemover",
18
+ "ColumnDropper",
19
+ "TypeCoercer",
20
+ "MissingImputer",
21
+ "OutlierHandler"
22
+ ]
@@ -0,0 +1,23 @@
1
+ from abc import ABC, abstractmethod
2
+ from typing import Tuple, List
3
+ import pandas as pd
4
+
5
class CleaningStep(ABC):
    """
    Interface for one unit of work in a cleaning run.

    Concrete steps override `execute`, which receives a DataFrame and hands
    back the resulting DataFrame together with log messages describing the
    changes that were made.
    """

    @abstractmethod
    def execute(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, List[str]]:
        """
        Apply this step to `df`.

        Args:
            df: The input pandas DataFrame.

        Returns:
            A `(cleaned_df, logs_list)` tuple, where `logs_list` holds the
            messages detailing what was changed.
        """
@@ -0,0 +1,118 @@
1
+ import time
2
+ import logging
3
+ from typing import List, Tuple
4
+ import pandas as pd
5
+ from sagaranalysis.cleaning.base import CleaningStep
6
+
7
+ logger = logging.getLogger("sagaranalysis.cleaning")
8
+
9
class CleaningSummary:
    """
    Record of one cleaning session: shapes before and after, the collected
    log messages, and how long the run took.
    """

    def __init__(self, initial_shape: Tuple[int, int]):
        self.initial_shape = initial_shape
        # Starts out equal to the initial shape; reassigned by whoever runs the cleaning.
        self.final_shape = initial_shape
        self.logs: List[str] = []
        self.execution_time_seconds: float = 0.0

    @property
    def rows_removed(self) -> int:
        """Row count of the initial shape minus row count of the final shape."""
        rows_before = self.initial_shape[0]
        rows_after = self.final_shape[0]
        return rows_before - rows_after

    @property
    def cols_removed(self) -> int:
        """Column count of the initial shape minus column count of the final shape."""
        cols_before = self.initial_shape[1]
        cols_after = self.final_shape[1]
        return cols_before - cols_after

    def add_logs(self, step_logs: List[str]) -> None:
        """Append every message in `step_logs` to the stored logs, in order."""
        self.logs += step_logs

    def report(self) -> str:
        """
        Build a markdown-formatted string describing the cleaning summary.
        """
        rows_in, cols_in = self.initial_shape[0], self.initial_shape[1]
        rows_out, cols_out = self.final_shape[0], self.final_shape[1]

        header = [
            "# Data Cleaning Summary Report",
            f"- **Execution Time:** {self.execution_time_seconds:.4f} seconds",
            f"- **Initial Shape:** {rows_in} rows, {cols_in} columns",
            f"- **Final Shape:** {rows_out} rows, {cols_out} columns",
            f"- **Rows Removed:** {self.rows_removed}",
            f"- **Columns Removed:** {self.cols_removed}",
            "\n## Operations Applied:",
        ]

        # One bullet per log entry, or a single placeholder bullet when nothing was logged.
        bullets = [f"- {log}" for log in self.logs] or ["- No operations performed."]

        return "\n".join(header + bullets)

    def __str__(self) -> str:
        return self.report()

    def __repr__(self) -> str:
        log_count = len(self.logs)
        return f"<CleaningSummary initial={self.initial_shape} final={self.final_shape} logs={log_count}>"
57
+
58
+
59
class CleaningPipeline:
    """
    Orchestrates the execution of multiple CleaningSteps.

    Steps run in the order they were added. A failing step does not abort the
    run: the error is logged, recorded in the summary, and the next step runs.
    """
    def __init__(self):
        self.steps: List[CleaningStep] = []

    def add_step(self, step: CleaningStep) -> None:
        """
        Appends a cleaning step to the pipeline.
        """
        self.steps.append(step)

    def _apply_step(
        self, step: CleaningStep, df: pd.DataFrame, summary: "CleaningSummary"
    ) -> pd.DataFrame:
        """
        Runs one step, recording its logs (or its error) in `summary`.

        Returns the step's output, or `df` unchanged when `execute` raised.
        """
        step_name = step.__class__.__name__
        try:
            df, step_logs = step.execute(df)
            summary.add_logs(step_logs)
        except Exception as e:
            # Deliberate best-effort: one bad step must not lose the others' work.
            logger.error(f"Error in cleaning step {step_name}: {e}", exc_info=True)
            summary.add_logs([f"ERROR in {step_name}: {str(e)}"])
        return df

    def run(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, CleaningSummary]:
        """
        Runs all cleaning steps in the pipeline sequentially.

        The input frame is copied first, so `df` itself is not reassigned or
        passed to any step. A progress bar (via `rich`) and a final success
        spinner (via `halo`) are shown when those optional packages are
        importable; otherwise the steps run without terminal output.

        Args:
            df: The pandas DataFrame to clean.

        Returns:
            A tuple `(cleaned_df, summary)`. The same summary object is also
            stored in `cleaned_df.attrs["cleaning_summary"]`.
        """
        # perf_counter is monotonic, so the measured duration cannot go
        # negative if the wall clock is adjusted mid-run.
        start_time = time.perf_counter()
        summary = CleaningSummary(df.shape)

        cleaned_df = df.copy()

        # rich is not among the declared install requirements; treat it as optional.
        try:
            from rich.progress import Progress, SpinnerColumn, BarColumn, TextColumn
        except ImportError:
            Progress = None

        if Progress is None:
            for step in self.steps:
                cleaned_df = self._apply_step(step, cleaned_df, summary)
        else:
            total_steps = len(self.steps)
            # Display rich progress bar
            with Progress(
                SpinnerColumn(spinner_name="dots", style="cyan"),
                TextColumn("[cyan]{task.description}"),
                BarColumn(bar_width=30, style="dim cyan", complete_style="cyan"),
                TextColumn("[cyan]{task.percentage:>3.0f}%"),
                transient=True
            ) as progress:
                task = progress.add_task("Executing cleaning pipeline...", total=total_steps)
                for position, step in enumerate(self.steps, start=1):
                    step_name = step.__class__.__name__
                    progress.update(task, description=f"Step {position}/{total_steps}: {step_name}")
                    cleaned_df = self._apply_step(step, cleaned_df, summary)

                    # Small artificial delay so that progress updates are readable;
                    # only paid when a progress bar is actually being drawn.
                    time.sleep(0.05)
                    progress.advance(task, 1)

        # halo is likewise optional; show the final success spinner only if present.
        try:
            from halo import Halo
        except ImportError:
            Halo = None

        if Halo is not None:
            success_spinner = Halo(text="Dataset cleaned successfully!", spinner="dots", color="cyan")
            success_spinner.start()
            success_spinner.succeed()

        summary.final_shape = cleaned_df.shape
        summary.execution_time_seconds = time.perf_counter() - start_time

        # Attach summary to the DataFrame attributes metadata
        cleaned_df.attrs["cleaning_summary"] = summary

        return cleaned_df, summary