hashprep 0.1.0a0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hashprep-0.1.0a0/LICENSE +21 -0
- hashprep-0.1.0a0/MANIFEST.in +13 -0
- hashprep-0.1.0a0/PKG-INFO +118 -0
- hashprep-0.1.0a0/README.md +72 -0
- hashprep-0.1.0a0/hashprep/__init__.py +3 -0
- hashprep-0.1.0a0/hashprep/analyzer.py +70 -0
- hashprep-0.1.0a0/hashprep/checks/__init__.py +34 -0
- hashprep-0.1.0a0/hashprep/checks/columns.py +91 -0
- hashprep-0.1.0a0/hashprep/checks/core.py +10 -0
- hashprep-0.1.0a0/hashprep/checks/correlations.py +114 -0
- hashprep-0.1.0a0/hashprep/checks/imbalance.py +18 -0
- hashprep-0.1.0a0/hashprep/checks/leakage.py +129 -0
- hashprep-0.1.0a0/hashprep/checks/missing_values.py +153 -0
- hashprep-0.1.0a0/hashprep/checks/outliers.py +109 -0
- hashprep-0.1.0a0/hashprep/cli/__init__.py +0 -0
- hashprep-0.1.0a0/hashprep/cli/main.py +196 -0
- hashprep-0.1.0a0/hashprep/reports/__init__.py +1 -0
- hashprep-0.1.0a0/hashprep/reports/generators.py +34 -0
- hashprep-0.1.0a0/hashprep/reports/html.py +133 -0
- hashprep-0.1.0a0/hashprep/reports/json.py +22 -0
- hashprep-0.1.0a0/hashprep/reports/markdown.py +50 -0
- hashprep-0.1.0a0/hashprep/reports/pdf.py +12 -0
- hashprep-0.1.0a0/hashprep/summaries/__init__.py +4 -0
- hashprep-0.1.0a0/hashprep/summaries/dataset.py +31 -0
- hashprep-0.1.0a0/hashprep/summaries/interactions.py +69 -0
- hashprep-0.1.0a0/hashprep/summaries/missing.py +17 -0
- hashprep-0.1.0a0/hashprep/summaries/variables.py +102 -0
- hashprep-0.1.0a0/hashprep.egg-info/PKG-INFO +118 -0
- hashprep-0.1.0a0/hashprep.egg-info/SOURCES.txt +33 -0
- hashprep-0.1.0a0/hashprep.egg-info/dependency_links.txt +1 -0
- hashprep-0.1.0a0/hashprep.egg-info/entry_points.txt +2 -0
- hashprep-0.1.0a0/hashprep.egg-info/requires.txt +10 -0
- hashprep-0.1.0a0/hashprep.egg-info/top_level.txt +1 -0
- hashprep-0.1.0a0/pyproject.toml +44 -0
- hashprep-0.1.0a0/setup.cfg +4 -0
hashprep-0.1.0a0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 HashPrep
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: hashprep
|
|
3
|
+
Version: 0.1.0a0
|
|
4
|
+
Summary: A library for dataset quality checks, preprocessing, and report generation
|
|
5
|
+
Author-email: "Aftaab Siddiqui (MaskedSyntax)" <aftaab@aftaab.xyz>
|
|
6
|
+
License: MIT License
|
|
7
|
+
|
|
8
|
+
Copyright (c) 2025 HashPrep
|
|
9
|
+
|
|
10
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
11
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
12
|
+
in the Software without restriction, including without limitation the rights
|
|
13
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
14
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
15
|
+
furnished to do so, subject to the following conditions:
|
|
16
|
+
|
|
17
|
+
The above copyright notice and this permission notice shall be included in all
|
|
18
|
+
copies or substantial portions of the Software.
|
|
19
|
+
|
|
20
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
21
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
22
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
23
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
24
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
25
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
26
|
+
SOFTWARE.
|
|
27
|
+
|
|
28
|
+
Project-URL: Homepage, https://github.com/cachevector/hashprep
|
|
29
|
+
Project-URL: Repository, https://github.com/cachevector/hashprep
|
|
30
|
+
Project-URL: Documentation, https://github.com/cachevector/hashprep
|
|
31
|
+
Project-URL: Issues, https://github.com/cachevector/hashprep/issues
|
|
32
|
+
Requires-Python: >=3.10
|
|
33
|
+
Description-Content-Type: text/markdown
|
|
34
|
+
License-File: LICENSE
|
|
35
|
+
Requires-Dist: click>=8.3.0
|
|
36
|
+
Requires-Dist: fastapi>=0.116.1
|
|
37
|
+
Requires-Dist: jinja2>=3.1.6
|
|
38
|
+
Requires-Dist: numpy>=2.2.6
|
|
39
|
+
Requires-Dist: pandas>=2.3.2
|
|
40
|
+
Requires-Dist: pyyaml>=6.0.2
|
|
41
|
+
Requires-Dist: scikit-learn>=1.7.2
|
|
42
|
+
Requires-Dist: scipy>=1.15.3
|
|
43
|
+
Requires-Dist: tabulate>=0.9.0
|
|
44
|
+
Requires-Dist: weasyprint>=66.0
|
|
45
|
+
Dynamic: license-file
|
|
46
|
+
|
|
47
|
+
<div align="center">
|
|
48
|
+
<picture>
|
|
49
|
+
<source media="(prefers-color-scheme: dark)" srcset="docs/assets/hashprep-wobg.svg" width="100">
|
|
50
|
+
<img alt="HashPrep Logo" src="docs/assets/hashprep-dark.svg" width="100">
|
|
51
|
+
</picture>
|
|
52
|
+
|
|
53
|
+
<h1>HashPrep</h1>
|
|
54
|
+
<p>
|
|
55
|
+
<b> Dataset Profiler & Debugger for Machine Learning </b>
|
|
56
|
+
</p>
|
|
57
|
+
|
|
58
|
+
<p align="center">
|
|
59
|
+
<!-- Distribution -->
|
|
60
|
+
<!-- <img src="https://img.shields.io/pypi/v/hashprep?color=blue&label=PyPI" /> -->
|
|
61
|
+
<img src="https://img.shields.io/badge/PyPI-Coming%20Soon-blue" />
|
|
62
|
+
<!-- License -->
|
|
63
|
+
<img src="https://img.shields.io/badge/License-MIT-green" />
|
|
64
|
+
<img src="https://img.shields.io/badge/CLI-Supported-orange" />
|
|
65
|
+
</p>
|
|
66
|
+
<p>
|
|
67
|
+
<!-- Features -->
|
|
68
|
+
<img src="https://img.shields.io/badge/Feature-Dataset%20Quality%20Assurance-critical" />
|
|
69
|
+
<img src="https://img.shields.io/badge/Feature-Preprocessing%20%2B%20Profiling-blueviolet" />
|
|
70
|
+
<img src="https://img.shields.io/badge/Feature-Report%20Generation-3f4f75" />
|
|
71
|
+
<img src="https://img.shields.io/badge/Feature-Quick%20Fixes-success" />
|
|
72
|
+
</p>
|
|
73
|
+
</div>
|
|
74
|
+
|
|
75
|
+
> [!WARNING]
|
|
76
|
+
> This repository is under active development and may not be stable.
|
|
77
|
+
|
|
78
|
+
## Overview
|
|
79
|
+
|
|
80
|
+
**HashPrep** is a Python library for intelligent dataset profiling and debugging that acts as a comprehensive pre-training quality assurance tool for machine learning projects.
|
|
81
|
+
Think of it as **"Pandas Profiling + PyLint for datasets"**, designed specifically for machine learning workflows.
|
|
82
|
+
|
|
83
|
+
It catches critical dataset issues before they derail your ML pipeline, explains the problems, and suggests context-aware fixes.
|
|
84
|
+
If you want, HashPrep can even apply those fixes for you automatically.
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
---
|
|
88
|
+
|
|
89
|
+
## Features
|
|
90
|
+
|
|
91
|
+
Key features include:
|
|
92
|
+
|
|
93
|
+
- **Intelligent Profiling**: Detect missing values, skewed distributions, outliers, and data type inconsistencies.
|
|
94
|
+
- **ML-Specific Checks**: Identify data leakage, dataset drift, class imbalance, and high-cardinality features.
|
|
95
|
+
- **Automated Preparation**: Get suggestions for encoding, imputation, scaling, and transformations, and optionally apply them automatically.
|
|
96
|
+
- **Rich Reporting**: Generate statistical summaries and exportable reports for collaboration.
|
|
97
|
+
- **Production-Ready Pipelines**: Output reproducible cleaning and preprocessing code that integrates seamlessly with ML workflows.
|
|
98
|
+
|
|
99
|
+
HashPrep turns dataset debugging into a guided, automated process — saving time, improving model reliability, and standardizing best practices across teams.
|
|
100
|
+
|
|
101
|
+
---
|
|
102
|
+
|
|
103
|
+
## License
|
|
104
|
+
|
|
105
|
+
This project is licensed under the [**MIT License**](./LICENSE).
|
|
106
|
+
|
|
107
|
+
---
|
|
108
|
+
|
|
109
|
+
## Contributing
|
|
110
|
+
|
|
111
|
+
We welcome contributions from the community to make HashPrep better!
|
|
112
|
+
|
|
113
|
+
Before you get started, please:
|
|
114
|
+
|
|
115
|
+
- Review our [CONTRIBUTING.md](./CONTRIBUTING.md) for detailed guidelines and setup instructions
|
|
116
|
+
- Write clean, well-documented code
|
|
117
|
+
- Follow best practices for the stack or component you’re working on
|
|
118
|
+
- Open a pull request (PR) with a clear description of your changes and motivation
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
<div align="center">
|
|
2
|
+
<picture>
|
|
3
|
+
<source media="(prefers-color-scheme: dark)" srcset="docs/assets/hashprep-wobg.svg" width="100">
|
|
4
|
+
<img alt="HashPrep Logo" src="docs/assets/hashprep-dark.svg" width="100">
|
|
5
|
+
</picture>
|
|
6
|
+
|
|
7
|
+
<h1>HashPrep</h1>
|
|
8
|
+
<p>
|
|
9
|
+
<b> Dataset Profiler & Debugger for Machine Learning </b>
|
|
10
|
+
</p>
|
|
11
|
+
|
|
12
|
+
<p align="center">
|
|
13
|
+
<!-- Distribution -->
|
|
14
|
+
<!-- <img src="https://img.shields.io/pypi/v/hashprep?color=blue&label=PyPI" /> -->
|
|
15
|
+
<img src="https://img.shields.io/badge/PyPI-Coming%20Soon-blue" />
|
|
16
|
+
<!-- License -->
|
|
17
|
+
<img src="https://img.shields.io/badge/License-MIT-green" />
|
|
18
|
+
<img src="https://img.shields.io/badge/CLI-Supported-orange" />
|
|
19
|
+
</p>
|
|
20
|
+
<p>
|
|
21
|
+
<!-- Features -->
|
|
22
|
+
<img src="https://img.shields.io/badge/Feature-Dataset%20Quality%20Assurance-critical" />
|
|
23
|
+
<img src="https://img.shields.io/badge/Feature-Preprocessing%20%2B%20Profiling-blueviolet" />
|
|
24
|
+
<img src="https://img.shields.io/badge/Feature-Report%20Generation-3f4f75" />
|
|
25
|
+
<img src="https://img.shields.io/badge/Feature-Quick%20Fixes-success" />
|
|
26
|
+
</p>
|
|
27
|
+
</div>
|
|
28
|
+
|
|
29
|
+
> [!WARNING]
|
|
30
|
+
> This repository is under active development and may not be stable.
|
|
31
|
+
|
|
32
|
+
## Overview
|
|
33
|
+
|
|
34
|
+
**HashPrep** is a Python library for intelligent dataset profiling and debugging that acts as a comprehensive pre-training quality assurance tool for machine learning projects.
|
|
35
|
+
Think of it as **"Pandas Profiling + PyLint for datasets"**, designed specifically for machine learning workflows.
|
|
36
|
+
|
|
37
|
+
It catches critical dataset issues before they derail your ML pipeline, explains the problems, and suggests context-aware fixes.
|
|
38
|
+
If you want, HashPrep can even apply those fixes for you automatically.
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
---
|
|
42
|
+
|
|
43
|
+
## Features
|
|
44
|
+
|
|
45
|
+
Key features include:
|
|
46
|
+
|
|
47
|
+
- **Intelligent Profiling**: Detect missing values, skewed distributions, outliers, and data type inconsistencies.
|
|
48
|
+
- **ML-Specific Checks**: Identify data leakage, dataset drift, class imbalance, and high-cardinality features.
|
|
49
|
+
- **Automated Preparation**: Get suggestions for encoding, imputation, scaling, and transformations, and optionally apply them automatically.
|
|
50
|
+
- **Rich Reporting**: Generate statistical summaries and exportable reports for collaboration.
|
|
51
|
+
- **Production-Ready Pipelines**: Output reproducible cleaning and preprocessing code that integrates seamlessly with ML workflows.
|
|
52
|
+
|
|
53
|
+
HashPrep turns dataset debugging into a guided, automated process — saving time, improving model reliability, and standardizing best practices across teams.
|
|
54
|
+
|
|
55
|
+
---
|
|
56
|
+
|
|
57
|
+
## License
|
|
58
|
+
|
|
59
|
+
This project is licensed under the [**MIT License**](./LICENSE).
|
|
60
|
+
|
|
61
|
+
---
|
|
62
|
+
|
|
63
|
+
## Contributing
|
|
64
|
+
|
|
65
|
+
We welcome contributions from the community to make HashPrep better!
|
|
66
|
+
|
|
67
|
+
Before you get started, please:
|
|
68
|
+
|
|
69
|
+
- Review our [CONTRIBUTING.md](./CONTRIBUTING.md) for detailed guidelines and setup instructions
|
|
70
|
+
- Write clean, well-documented code
|
|
71
|
+
- Follow best practices for the stack or component you’re working on
|
|
72
|
+
- Open a pull request (PR) with a clear description of your changes and motivation
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
from typing import Dict, List, Optional
|
|
2
|
+
import pandas as pd
|
|
3
|
+
|
|
4
|
+
from .checks import run_checks
|
|
5
|
+
from .summaries import (
|
|
6
|
+
get_dataset_preview,
|
|
7
|
+
summarize_dataset_info,
|
|
8
|
+
summarize_variable_types,
|
|
9
|
+
add_reproduction_info,
|
|
10
|
+
summarize_variables,
|
|
11
|
+
summarize_interactions,
|
|
12
|
+
summarize_missing_values,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
class DatasetAnalyzer:
    """Orchestrate dataset summaries and quality checks over a DataFrame.

    Builds descriptive summaries, runs the selected (or all) registered
    checks, and shapes the results into a plain-dict report suitable for
    the report generators.
    """

    def __init__(
        self,
        df: pd.DataFrame,
        target_col: Optional[str] = None,
        selected_checks: Optional[List[str]] = None,
    ):
        self.df = df
        self.target_col = target_col
        # None means "run everything"; otherwise an allow-list of check names.
        self.selected_checks = selected_checks
        self.issues = []
        self.summaries = {}
        # Registry of every check name this analyzer knows how to run.
        self.all_checks = [
            "data_leakage", "high_missing_values", "empty_columns", "single_value_columns",
            "target_leakage_patterns", "class_imbalance", "high_cardinality", "duplicates",
            "mixed_data_types", "outliers", "feature_correlation", "categorical_correlation",
            "mixed_correlation", "dataset_missingness", "high_zero_counts",
            "extreme_text_lengths", "datetime_skew", "missing_patterns",
        ]

    def analyze(self) -> Dict:
        """Collect summaries, run the configured checks, and return the report dict."""
        self.summaries.update(get_dataset_preview(self.df))
        self.summaries.update(summarize_dataset_info(self.df))
        self.summaries["variable_types"] = summarize_variable_types(self.df)
        self.summaries["reproduction_info"] = add_reproduction_info(self.df)
        self.summaries["variables"] = summarize_variables(self.df)
        self.summaries.update(summarize_interactions(self.df))
        self.summaries.update(summarize_missing_values(self.df))

        if self.selected_checks is None:
            checks_to_run = self.all_checks
        else:
            # Unknown names are silently dropped rather than raising.
            checks_to_run = [name for name in self.selected_checks if name in self.all_checks]
        self.issues = run_checks(self, checks_to_run)

        return self._generate_summary()

    def _generate_summary(self):
        """Shape the collected issues and summaries into the final report dict."""
        tally = {"critical": 0, "warning": 0}
        for issue in self.issues:
            if issue.severity in tally:
                tally[issue.severity] += 1
        return {
            "critical_count": tally["critical"],
            "warning_count": tally["warning"],
            "total_issues": len(self.issues),
            "issues": [self._issue_as_dict(issue) for issue in self.issues],
            "summaries": self.summaries,
        }

    @staticmethod
    def _issue_as_dict(issue):
        """Project one check issue onto the plain-dict shape used in reports."""
        return {
            "category": issue.category,
            "severity": issue.severity,
            "column": issue.column,
            "description": issue.description,
            "impact_score": issue.impact_score,
            "quick_fix": issue.quick_fix,
        }
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
from .core import Issues
|
|
2
|
+
from .leakage import _check_data_leakage, _check_target_leakage_patterns
|
|
3
|
+
from .missing_values import _check_high_missing_values, _check_empty_columns, _check_dataset_missingness, _check_missing_patterns
|
|
4
|
+
from .columns import _check_single_value_columns, _check_high_cardinality, _check_duplicates, _check_mixed_data_types
|
|
5
|
+
from .outliers import _check_outliers, _check_high_zero_counts, _check_extreme_text_lengths, _check_datetime_skew
|
|
6
|
+
from .correlations import _check_feature_correlation, _check_categorical_correlation, _check_mixed_correlation
|
|
7
|
+
from .imbalance import _check_class_imbalance
|
|
8
|
+
|
|
9
|
+
CHECKS = {
|
|
10
|
+
"data_leakage": _check_data_leakage,
|
|
11
|
+
"high_missing_values": _check_high_missing_values,
|
|
12
|
+
"empty_columns": _check_empty_columns,
|
|
13
|
+
"single_value_columns": _check_single_value_columns,
|
|
14
|
+
"target_leakage_patterns": _check_target_leakage_patterns,
|
|
15
|
+
"class_imbalance": _check_class_imbalance,
|
|
16
|
+
"high_cardinality": _check_high_cardinality,
|
|
17
|
+
"duplicates": _check_duplicates,
|
|
18
|
+
"mixed_data_types": _check_mixed_data_types,
|
|
19
|
+
"outliers": _check_outliers,
|
|
20
|
+
"feature_correlation": _check_feature_correlation,
|
|
21
|
+
"categorical_correlation": _check_categorical_correlation,
|
|
22
|
+
"mixed_correlation": _check_mixed_correlation,
|
|
23
|
+
"dataset_missingness": _check_dataset_missingness,
|
|
24
|
+
"high_zero_counts": _check_high_zero_counts,
|
|
25
|
+
"extreme_text_lengths": _check_extreme_text_lengths,
|
|
26
|
+
"datetime_skew": _check_datetime_skew,
|
|
27
|
+
"missing_patterns": _check_missing_patterns,
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
def run_checks(analyzer, checks_to_run):
    """Execute each named check against *analyzer* and pool the resulting issues.

    *checks_to_run* must contain only keys present in ``CHECKS``; the caller
    (DatasetAnalyzer.analyze) filters unknown names beforehand.
    """
    return [issue for name in checks_to_run for issue in CHECKS[name](analyzer)]
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
from .core import Issues
|
|
2
|
+
|
|
3
|
+
def _check_single_value_columns(analyzer):
|
|
4
|
+
issues = []
|
|
5
|
+
for col in analyzer.df.columns:
|
|
6
|
+
if analyzer.df[col].nunique(dropna=True) == 1:
|
|
7
|
+
impact = "low" if col != analyzer.target_col else "high"
|
|
8
|
+
severity = "warning" if col != analyzer.target_col else "critical"
|
|
9
|
+
quick_fix = (
|
|
10
|
+
"Options: \n- Drop column: Not informative for modeling (Pros: Simplifies model; Cons: None).\n- Verify data: Ensure single value isn't an error (Pros: Validates data; Cons: Time-consuming)."
|
|
11
|
+
if col != analyzer.target_col
|
|
12
|
+
else "Options: \n- Redefine target: Replace with a more variable target (Pros: Enables modeling; Cons: Requires new data).\n- Stop analysis: Constant target prevents meaningful prediction (Pros: Avoids invalid model; Cons: Halts analysis)."
|
|
13
|
+
)
|
|
14
|
+
issues.append(
|
|
15
|
+
Issues(
|
|
16
|
+
category="single_value",
|
|
17
|
+
severity=severity,
|
|
18
|
+
column=col,
|
|
19
|
+
description=f"Column '{col}' contains only one unique value",
|
|
20
|
+
impact_score=impact,
|
|
21
|
+
quick_fix=quick_fix,
|
|
22
|
+
)
|
|
23
|
+
)
|
|
24
|
+
return issues
|
|
25
|
+
|
|
26
|
+
def _check_high_cardinality(analyzer, threshold: int = 100, critical_threshold: float = 0.9):
|
|
27
|
+
issues = []
|
|
28
|
+
categorical_cols = analyzer.df.select_dtypes(include="object").columns.tolist()
|
|
29
|
+
for col in categorical_cols:
|
|
30
|
+
unique_count = int(analyzer.df[col].nunique())
|
|
31
|
+
unique_ratio = float(unique_count / len(analyzer.df))
|
|
32
|
+
if unique_count > threshold:
|
|
33
|
+
severity = "critical" if unique_ratio > critical_threshold else "warning"
|
|
34
|
+
impact = "high" if severity == "critical" else "medium"
|
|
35
|
+
quick_fix = (
|
|
36
|
+
"Options: \n- Drop column: Avoids overfitting from unique identifiers (Pros: Simplifies model; Cons: Loses potential info).\n- Engineer feature: Extract patterns (e.g., titles from names) (Pros: Retains useful info; Cons: Requires domain knowledge).\n- Use hashing: Reduce dimensionality (Pros: Scalable; Cons: May lose interpretability)."
|
|
37
|
+
if severity == "critical"
|
|
38
|
+
else "Options: \n- Group rare categories: Reduce cardinality (Pros: Simplifies feature; Cons: May lose nuance).\n- Use feature hashing: Map to lower dimensions (Pros: Scalable; Cons: Less interpretable).\n- Retain and test: Evaluate feature importance (Pros: Data-driven; Cons: Risk of overfitting)."
|
|
39
|
+
)
|
|
40
|
+
issues.append(
|
|
41
|
+
Issues(
|
|
42
|
+
category="high_cardinality",
|
|
43
|
+
severity=severity,
|
|
44
|
+
column=col,
|
|
45
|
+
description=f"Column '{col}' has {unique_count} unique values ({unique_ratio:.1%} of rows)",
|
|
46
|
+
impact_score=impact,
|
|
47
|
+
quick_fix=quick_fix,
|
|
48
|
+
)
|
|
49
|
+
)
|
|
50
|
+
return issues
|
|
51
|
+
|
|
52
|
+
def _check_duplicates(analyzer):
|
|
53
|
+
issues = []
|
|
54
|
+
duplicate_rows = int(analyzer.df.duplicated().sum())
|
|
55
|
+
if duplicate_rows > 0:
|
|
56
|
+
duplicate_ratio = float(duplicate_rows / len(analyzer.df))
|
|
57
|
+
severity = "critical" if duplicate_ratio > 0.1 else "warning"
|
|
58
|
+
impact = "high" if severity == "critical" else "medium"
|
|
59
|
+
quick_fix = (
|
|
60
|
+
"Options: \n- Drop duplicates: Ensures data integrity (Pros: Cleaner data; Cons: May lose valid repeats).\n- Verify duplicates: Check if intentional (e.g., time-series) (Pros: Validates data; Cons: Time-consuming)."
|
|
61
|
+
if severity == "critical"
|
|
62
|
+
else "Options: \n- Drop duplicates: Simplifies dataset (Pros: Cleaner data; Cons: May lose valid repeats).\n- Keep duplicates: If meaningful (e.g., repeated events) (Pros: Retains info; Cons: May bias model).\n- Test impact: Evaluate model performance with/without duplicates (Pros: Data-driven; Cons: Requires computation)."
|
|
63
|
+
)
|
|
64
|
+
issues.append(
|
|
65
|
+
Issues(
|
|
66
|
+
category="duplicates",
|
|
67
|
+
severity=severity,
|
|
68
|
+
column="__all__",
|
|
69
|
+
description=f"Dataset contains {duplicate_rows} duplicate rows ({duplicate_ratio:.1%} of rows)",
|
|
70
|
+
impact_score=impact,
|
|
71
|
+
quick_fix=quick_fix,
|
|
72
|
+
)
|
|
73
|
+
)
|
|
74
|
+
return issues
|
|
75
|
+
|
|
76
|
+
def _check_mixed_data_types(analyzer):
|
|
77
|
+
issues = []
|
|
78
|
+
for col in analyzer.df.columns:
|
|
79
|
+
types = analyzer.df[col].dropna().map(type).nunique()
|
|
80
|
+
if types > 1:
|
|
81
|
+
issues.append(
|
|
82
|
+
Issues(
|
|
83
|
+
category="mixed_types",
|
|
84
|
+
severity="warning",
|
|
85
|
+
column=col,
|
|
86
|
+
description=f"Column '{col}' contains mixed data types",
|
|
87
|
+
impact_score="low",
|
|
88
|
+
quick_fix="Options: \n- Cast to single type: Ensure consistency (Pros: Simplifies processing; Cons: May lose nuance).\n- Split column: Separate types into new features (Pros: Preserves info; Cons: Adds complexity).\n- Investigate source: Check data collection errors (Pros: Improves quality; Cons: Time-consuming).",
|
|
89
|
+
)
|
|
90
|
+
)
|
|
91
|
+
return issues
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
from .core import Issues
|
|
2
|
+
import pandas as pd
|
|
3
|
+
from scipy.stats import chi2_contingency, f_oneway
|
|
4
|
+
import numpy as np
|
|
5
|
+
|
|
6
|
+
def _check_feature_correlation(analyzer, threshold: float = 0.95, critical_threshold: float = 0.98):
|
|
7
|
+
issues = []
|
|
8
|
+
numeric_df = analyzer.df.select_dtypes(include="number")
|
|
9
|
+
if numeric_df.empty:
|
|
10
|
+
return issues
|
|
11
|
+
corr_matrix = numeric_df.corr().abs()
|
|
12
|
+
upper = corr_matrix.where(np.tril(np.ones(corr_matrix.shape)).astype(bool))
|
|
13
|
+
correlated_pairs = [
|
|
14
|
+
(col, row, float(val))
|
|
15
|
+
for row in upper.index
|
|
16
|
+
for col, val in upper[row].dropna().items()
|
|
17
|
+
if val > threshold and col != row
|
|
18
|
+
]
|
|
19
|
+
for col1, col2, corr in correlated_pairs:
|
|
20
|
+
severity = "critical" if corr > critical_threshold else "warning"
|
|
21
|
+
impact = "high" if severity == "critical" else "medium"
|
|
22
|
+
quick_fix = (
|
|
23
|
+
"Options: \n- Drop one feature: Reduces multicollinearity (Pros: Simplifies model; Cons: Loses info).\n- Combine features: Create composite feature (e.g., PCA) (Pros: Retains info; Cons: Less interpretable).\n- Retain and test: Use robust models (e.g., trees) (Pros: Keeps info; Cons: May affect sensitive models)."
|
|
24
|
+
if severity == "critical"
|
|
25
|
+
else "Options: \n- Drop one feature: If less predictive (Pros: Simplifies model; Cons: Loses info).\n- Retain and test: Evaluate with robust models (Pros: Keeps info; Cons: Risk of multicollinearity).\n- Engineer feature: Combine or transform features (Pros: Reduces redundancy; Cons: Adds complexity)."
|
|
26
|
+
)
|
|
27
|
+
issues.append(
|
|
28
|
+
Issues(
|
|
29
|
+
category="feature_correlation",
|
|
30
|
+
severity=severity,
|
|
31
|
+
column=f"{col1},{col2}",
|
|
32
|
+
description=f"Columns '{col1}' and '{col2}' are highly correlated ({corr:.2f})",
|
|
33
|
+
impact_score=impact,
|
|
34
|
+
quick_fix=quick_fix,
|
|
35
|
+
)
|
|
36
|
+
)
|
|
37
|
+
return issues
|
|
38
|
+
|
|
39
|
+
def _check_categorical_correlation(analyzer, threshold: float = 0.8, critical_threshold: float = 0.95):
    """Flag object-typed column pairs with a strong chi-square association.

    For every pair of object columns, builds a contingency table and computes
    Cramer's V from the chi-square statistic. Pairs with V above *threshold*
    are reported as warnings, and above *critical_threshold* as critical.

    Args:
        analyzer: object exposing the DataFrame under review as ``analyzer.df``.
        threshold: Cramer's V above which an issue is raised.
        critical_threshold: Cramer's V above which severity becomes critical.

    Returns:
        list[Issues]: one issue per strongly associated pair.
    """
    issues = []
    categorical = analyzer.df.select_dtypes(include="object").columns.tolist()
    # Each unordered pair is examined exactly once.
    for i, c1 in enumerate(categorical):
        for c2 in categorical[i + 1 :]:
            try:
                table = pd.crosstab(analyzer.df[c1], analyzer.df[c2])
                chi2, _, _, _ = chi2_contingency(table)
                n = table.sum().sum()
                # Cramer's V = sqrt(phi^2 / min(k-1, r-1)) where phi^2 = chi2/n.
                phi2 = chi2 / n
                r, k = table.shape
                cramers_v = np.sqrt(phi2 / min(k - 1, r - 1))
                if cramers_v > threshold:
                    severity = "critical" if cramers_v > critical_threshold else "warning"
                    impact = "high" if severity == "critical" else "medium"
                    quick_fix = (
                        "Options: \n- Drop one feature: Avoids overfitting from high redundancy (Pros: Simplifies model; Cons: Loses info).\n- Engineer feature: Extract common patterns (e.g., group categories) (Pros: Retains info; Cons: Requires domain knowledge).\n- Retain and test: Use robust models (e.g., trees) (Pros: Keeps info; Cons: May affect sensitive models)."
                        if severity == "critical"
                        else "Options: \n- Drop one feature: If less predictive (Pros: Simplifies model; Cons: Loses info).\n- Retain and test: Evaluate with robust models (Pros: Keeps info; Cons: Risk of redundancy).\n- Engineer feature: Group categories or encode differently (Pros: Reduces redundancy; Cons: Adds complexity)."
                    )
                    issues.append(
                        Issues(
                            # NOTE(review): category is "feature_correlation", not
                            # "categorical_correlation" — apparently intentional so all
                            # correlation checks group together in reports; confirm.
                            category="feature_correlation",
                            severity=severity,
                            column=f"{c1},{c2}",
                            description=f"Columns '{c1}' and '{c2}' are highly associated (Cramer's V: {float(cramers_v):.2f})",
                            impact_score=impact,
                            quick_fix=quick_fix,
                        )
                    )
            except Exception:
                # Best-effort: degenerate pairs are skipped rather than aborting the
                # whole check — e.g. a single-level column makes min(k-1, r-1) == 0
                # (ZeroDivisionError), and chi2_contingency can raise on empty tables.
                continue
    return issues
|
|
72
|
+
|
|
73
|
+
def _check_mixed_correlation(analyzer, p_threshold: float = 0.05, critical_p_threshold: float = 0.001):
|
|
74
|
+
issues = []
|
|
75
|
+
cat_cols = analyzer.df.select_dtypes(
|
|
76
|
+
include=["object", "category"]
|
|
77
|
+
).columns.tolist()
|
|
78
|
+
num_cols = analyzer.df.select_dtypes(include=["int64", "float64"]).columns.tolist()
|
|
79
|
+
for cat in cat_cols:
|
|
80
|
+
for num in num_cols:
|
|
81
|
+
groups = [
|
|
82
|
+
analyzer.df.loc[analyzer.df[cat] == level, num].dropna().to_numpy()
|
|
83
|
+
for level in analyzer.df[cat].dropna().unique()
|
|
84
|
+
if len(analyzer.df.loc[analyzer.df[cat] == level, num].dropna()) > 1
|
|
85
|
+
]
|
|
86
|
+
if len(groups) < 2 or all(np.var(g, ddof=1) == 0 for g in groups):
|
|
87
|
+
continue
|
|
88
|
+
try:
|
|
89
|
+
f_stat, p_val = f_oneway(*groups)
|
|
90
|
+
if p_val < p_threshold:
|
|
91
|
+
severity = (
|
|
92
|
+
"critical"
|
|
93
|
+
if p_val < critical_p_threshold and f_stat > 20.0
|
|
94
|
+
else "warning"
|
|
95
|
+
)
|
|
96
|
+
impact = "high" if severity == "critical" else "medium"
|
|
97
|
+
quick_fix = (
|
|
98
|
+
"Options: \n- Drop one feature: Avoids redundancy (Pros: Simplifies model; Cons: Loses info).\n- Engineer feature: Transform categorical or numeric feature (Pros: Retains info; Cons: Adds complexity).\n- Retain and test: Use robust models (e.g., trees) (Pros: Keeps info; Cons: May affect sensitive models)."
|
|
99
|
+
if severity == "critical"
|
|
100
|
+
else "Options: \n- Drop one feature: If less predictive (Pros: Simplifies model; Cons: Loses info).\n- Retain and test: Evaluate with robust models (Pros: Keeps info; Cons: Risk of redundancy).\n- Engineer feature: Transform or encode differently (Pros: Reduces redundancy; Cons: Adds complexity)."
|
|
101
|
+
)
|
|
102
|
+
issues.append(
|
|
103
|
+
Issues(
|
|
104
|
+
category="feature_correlation",
|
|
105
|
+
severity=severity,
|
|
106
|
+
column=f"{cat},{num}",
|
|
107
|
+
description=f"Columns '{cat}' and '{num}' show strong association (F: {float(f_stat):.2f}, p: {float(p_val):.4f})",
|
|
108
|
+
impact_score=impact,
|
|
109
|
+
quick_fix=quick_fix,
|
|
110
|
+
)
|
|
111
|
+
)
|
|
112
|
+
except Exception:
|
|
113
|
+
continue
|
|
114
|
+
return issues
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
from .core import Issues
|
|
2
|
+
|
|
3
|
+
def _check_class_imbalance(analyzer, threshold: float = 0.9):
|
|
4
|
+
issues = []
|
|
5
|
+
if analyzer.target_col and analyzer.target_col in analyzer.df.columns:
|
|
6
|
+
counts = analyzer.df[analyzer.target_col].value_counts(normalize=True)
|
|
7
|
+
if counts.max() > threshold:
|
|
8
|
+
issues.append(
|
|
9
|
+
Issues(
|
|
10
|
+
category="class_imbalance",
|
|
11
|
+
severity="warning",
|
|
12
|
+
column=analyzer.target_col,
|
|
13
|
+
description=f"Target '{analyzer.target_col}' is imbalanced ({float(counts.max()):.1%} in one class)",
|
|
14
|
+
impact_score="medium",
|
|
15
|
+
quick_fix="Options: \n- Resample data: Use oversampling (e.g., SMOTE) or undersampling (Pros: Balances classes; Cons: May introduce bias or lose data).\n- Use class weights: Adjust model weights for imbalance (Pros: Simple; Cons: Model-dependent).\n- Stratified sampling: Ensure balanced splits in training (Pros: Improves evaluation; Cons: Requires careful implementation).",
|
|
16
|
+
)
|
|
17
|
+
)
|
|
18
|
+
return issues
|