hashprep 0.1.0a0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. hashprep-0.1.0a0/LICENSE +21 -0
  2. hashprep-0.1.0a0/MANIFEST.in +13 -0
  3. hashprep-0.1.0a0/PKG-INFO +118 -0
  4. hashprep-0.1.0a0/README.md +72 -0
  5. hashprep-0.1.0a0/hashprep/__init__.py +3 -0
  6. hashprep-0.1.0a0/hashprep/analyzer.py +70 -0
  7. hashprep-0.1.0a0/hashprep/checks/__init__.py +34 -0
  8. hashprep-0.1.0a0/hashprep/checks/columns.py +91 -0
  9. hashprep-0.1.0a0/hashprep/checks/core.py +10 -0
  10. hashprep-0.1.0a0/hashprep/checks/correlations.py +114 -0
  11. hashprep-0.1.0a0/hashprep/checks/imbalance.py +18 -0
  12. hashprep-0.1.0a0/hashprep/checks/leakage.py +129 -0
  13. hashprep-0.1.0a0/hashprep/checks/missing_values.py +153 -0
  14. hashprep-0.1.0a0/hashprep/checks/outliers.py +109 -0
  15. hashprep-0.1.0a0/hashprep/cli/__init__.py +0 -0
  16. hashprep-0.1.0a0/hashprep/cli/main.py +196 -0
  17. hashprep-0.1.0a0/hashprep/reports/__init__.py +1 -0
  18. hashprep-0.1.0a0/hashprep/reports/generators.py +34 -0
  19. hashprep-0.1.0a0/hashprep/reports/html.py +133 -0
  20. hashprep-0.1.0a0/hashprep/reports/json.py +22 -0
  21. hashprep-0.1.0a0/hashprep/reports/markdown.py +50 -0
  22. hashprep-0.1.0a0/hashprep/reports/pdf.py +12 -0
  23. hashprep-0.1.0a0/hashprep/summaries/__init__.py +4 -0
  24. hashprep-0.1.0a0/hashprep/summaries/dataset.py +31 -0
  25. hashprep-0.1.0a0/hashprep/summaries/interactions.py +69 -0
  26. hashprep-0.1.0a0/hashprep/summaries/missing.py +17 -0
  27. hashprep-0.1.0a0/hashprep/summaries/variables.py +102 -0
  28. hashprep-0.1.0a0/hashprep.egg-info/PKG-INFO +118 -0
  29. hashprep-0.1.0a0/hashprep.egg-info/SOURCES.txt +33 -0
  30. hashprep-0.1.0a0/hashprep.egg-info/dependency_links.txt +1 -0
  31. hashprep-0.1.0a0/hashprep.egg-info/entry_points.txt +2 -0
  32. hashprep-0.1.0a0/hashprep.egg-info/requires.txt +10 -0
  33. hashprep-0.1.0a0/hashprep.egg-info/top_level.txt +1 -0
  34. hashprep-0.1.0a0/pyproject.toml +44 -0
  35. hashprep-0.1.0a0/setup.cfg +4 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 HashPrep
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,13 @@
1
+ include README.md LICENSE
2
+ recursive-include hashprep *.py
3
+
4
+ prune datasets
5
+ prune examples
6
+ prune tests
7
+ prune docs
8
+ prune web
9
+ prune scripts
10
+ exclude *.pyc
11
+ exclude __pycache__/*
12
+ exclude test.json
13
+ exclude todo.md
@@ -0,0 +1,118 @@
1
+ Metadata-Version: 2.4
2
+ Name: hashprep
3
+ Version: 0.1.0a0
4
+ Summary: A library for dataset quality checks, preprocessing, and report generation
5
+ Author-email: "Aftaab Siddiqui (MaskedSyntax)" <aftaab@aftaab.xyz>
6
+ License: MIT License
7
+
8
+ Copyright (c) 2025 HashPrep
9
+
10
+ Permission is hereby granted, free of charge, to any person obtaining a copy
11
+ of this software and associated documentation files (the "Software"), to deal
12
+ in the Software without restriction, including without limitation the rights
13
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14
+ copies of the Software, and to permit persons to whom the Software is
15
+ furnished to do so, subject to the following conditions:
16
+
17
+ The above copyright notice and this permission notice shall be included in all
18
+ copies or substantial portions of the Software.
19
+
20
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26
+ SOFTWARE.
27
+
28
+ Project-URL: Homepage, https://github.com/cachevector/hashprep
29
+ Project-URL: Repository, https://github.com/cachevector/hashprep
30
+ Project-URL: Documentation, https://github.com/cachevector/hashprep
31
+ Project-URL: Issues, https://github.com/cachevector/hashprep/issues
32
+ Requires-Python: >=3.10
33
+ Description-Content-Type: text/markdown
34
+ License-File: LICENSE
35
+ Requires-Dist: click>=8.3.0
36
+ Requires-Dist: fastapi>=0.116.1
37
+ Requires-Dist: jinja2>=3.1.6
38
+ Requires-Dist: numpy>=2.2.6
39
+ Requires-Dist: pandas>=2.3.2
40
+ Requires-Dist: pyyaml>=6.0.2
41
+ Requires-Dist: scikit-learn>=1.7.2
42
+ Requires-Dist: scipy>=1.15.3
43
+ Requires-Dist: tabulate>=0.9.0
44
+ Requires-Dist: weasyprint>=66.0
45
+ Dynamic: license-file
46
+
47
+ <div align="center">
48
+ <picture>
49
+ <source media="(prefers-color-scheme: dark)" srcset="docs/assets/hashprep-wobg.svg" width="100">
50
+ <img alt="HashPrep Logo" src="docs/assets/hashprep-dark.svg" width="100">
51
+ </picture>
52
+
53
+ <h1>HashPrep</h1>
54
+ <p>
55
+ <b> Dataset Profiler & Debugger for Machine Learning </b>
56
+ </p>
57
+
58
+ <p align="center">
59
+ <!-- Distribution -->
60
+ <!-- <img src="https://img.shields.io/pypi/v/hashprep?color=blue&label=PyPI" /> -->
61
+ <img src="https://img.shields.io/badge/PyPI-Coming%20Soon-blue" />
62
+ <!-- License -->
63
+ <img src="https://img.shields.io/badge/License-MIT-green" />
64
+ <img src="https://img.shields.io/badge/CLI-Supported-orange" />
65
+ </p>
66
+ <p>
67
+ <!-- Features -->
68
+ <img src="https://img.shields.io/badge/Feature-Dataset%20Quality%20Assurance-critical" />
69
+ <img src="https://img.shields.io/badge/Feature-Preprocessing%20%2B%20Profiling-blueviolet" />
70
+ <img src="https://img.shields.io/badge/Feature-Report%20Generation-3f4f75" />
71
+ <img src="https://img.shields.io/badge/Feature-Quick%20Fixes-success" />
72
+ </p>
73
+ </div>
74
+
75
+ > [!WARNING]
76
+ > This repository is under active development and may not be stable.
77
+
78
+ ## Overview
79
+
80
+ **HashPrep** is a Python library for intelligent dataset profiling and debugging that acts as a comprehensive pre-training quality assurance tool for machine learning projects.
81
+ Think of it as **"Pandas Profiling + PyLint for datasets"**, designed specifically for machine learning workflows.
82
+
83
+ It catches critical dataset issues before they derail your ML pipeline, explains the problems, and suggests context-aware fixes.
84
+ If you want, HashPrep can even apply those fixes for you automatically.
85
+
86
+
87
+ ---
88
+
89
+ ## Features
90
+
91
+ Key features include:
92
+
93
+ - **Intelligent Profiling**: Detect missing values, skewed distributions, outliers, and data type inconsistencies.
94
+ - **ML-Specific Checks**: Identify data leakage, dataset drift, class imbalance, and high-cardinality features.
95
+ - **Automated Preparation**: Get suggestions for encoding, imputation, scaling, and transformations, and optionally apply them automatically.
96
+ - **Rich Reporting**: Generate statistical summaries and exportable reports for collaboration.
97
+ - **Production-Ready Pipelines**: Output reproducible cleaning and preprocessing code that integrates seamlessly with ML workflows.
98
+
99
+ HashPrep turns dataset debugging into a guided, automated process - saving time, improving model reliability, and standardizing best practices across teams.
100
+
101
+ ---
102
+
103
+ ## License
104
+
105
+ This project is licensed under the [**MIT License**](./LICENSE).
106
+
107
+ ---
108
+
109
+ ## Contributing
110
+
111
+ We welcome contributions from the community to make HashPrep better!
112
+
113
+ Before you get started, please:
114
+
115
+ - Review our [CONTRIBUTING.md](./CONTRIBUTING.md) for detailed guidelines and setup instructions
116
+ - Write clean, well-documented code
117
+ - Follow best practices for the stack or component you’re working on
118
+ - Open a pull request (PR) with a clear description of your changes and motivation
@@ -0,0 +1,72 @@
1
+ <div align="center">
2
+ <picture>
3
+ <source media="(prefers-color-scheme: dark)" srcset="docs/assets/hashprep-wobg.svg" width="100">
4
+ <img alt="HashPrep Logo" src="docs/assets/hashprep-dark.svg" width="100">
5
+ </picture>
6
+
7
+ <h1>HashPrep</h1>
8
+ <p>
9
+ <b> Dataset Profiler & Debugger for Machine Learning </b>
10
+ </p>
11
+
12
+ <p align="center">
13
+ <!-- Distribution -->
14
+ <!-- <img src="https://img.shields.io/pypi/v/hashprep?color=blue&label=PyPI" /> -->
15
+ <img src="https://img.shields.io/badge/PyPI-Coming%20Soon-blue" />
16
+ <!-- License -->
17
+ <img src="https://img.shields.io/badge/License-MIT-green" />
18
+ <img src="https://img.shields.io/badge/CLI-Supported-orange" />
19
+ </p>
20
+ <p>
21
+ <!-- Features -->
22
+ <img src="https://img.shields.io/badge/Feature-Dataset%20Quality%20Assurance-critical" />
23
+ <img src="https://img.shields.io/badge/Feature-Preprocessing%20%2B%20Profiling-blueviolet" />
24
+ <img src="https://img.shields.io/badge/Feature-Report%20Generation-3f4f75" />
25
+ <img src="https://img.shields.io/badge/Feature-Quick%20Fixes-success" />
26
+ </p>
27
+ </div>
28
+
29
+ > [!WARNING]
30
+ > This repository is under active development and may not be stable.
31
+
32
+ ## Overview
33
+
34
+ **HashPrep** is a Python library for intelligent dataset profiling and debugging that acts as a comprehensive pre-training quality assurance tool for machine learning projects.
35
+ Think of it as **"Pandas Profiling + PyLint for datasets"**, designed specifically for machine learning workflows.
36
+
37
+ It catches critical dataset issues before they derail your ML pipeline, explains the problems, and suggests context-aware fixes.
38
+ If you want, HashPrep can even apply those fixes for you automatically.
39
+
40
+
41
+ ---
42
+
43
+ ## Features
44
+
45
+ Key features include:
46
+
47
+ - **Intelligent Profiling**: Detect missing values, skewed distributions, outliers, and data type inconsistencies.
48
+ - **ML-Specific Checks**: Identify data leakage, dataset drift, class imbalance, and high-cardinality features.
49
+ - **Automated Preparation**: Get suggestions for encoding, imputation, scaling, and transformations, and optionally apply them automatically.
50
+ - **Rich Reporting**: Generate statistical summaries and exportable reports for collaboration.
51
+ - **Production-Ready Pipelines**: Output reproducible cleaning and preprocessing code that integrates seamlessly with ML workflows.
52
+
53
+ HashPrep turns dataset debugging into a guided, automated process - saving time, improving model reliability, and standardizing best practices across teams.
54
+
55
+ ---
56
+
57
+ ## License
58
+
59
+ This project is licensed under the [**MIT License**](./LICENSE).
60
+
61
+ ---
62
+
63
+ ## Contributing
64
+
65
+ We welcome contributions from the community to make HashPrep better!
66
+
67
+ Before you get started, please:
68
+
69
+ - Review our [CONTRIBUTING.md](./CONTRIBUTING.md) for detailed guidelines and setup instructions
70
+ - Write clean, well-documented code
71
+ - Follow best practices for the stack or component you’re working on
72
+ - Open a pull request (PR) with a clear description of your changes and motivation
@@ -0,0 +1,3 @@
1
from .analyzer import DatasetAnalyzer

# Keep in sync with pyproject.toml. Use the PEP 440 canonical form so that
# hashprep.__version__ matches the built distribution's metadata
# ("Version: 0.1.0a0"); the previous value "0.1.0-alpha" is only a
# non-normalized alias of the same version and compares unequal as a string.
__version__ = "0.1.0a0"
@@ -0,0 +1,70 @@
1
+ from typing import Dict, List, Optional
2
+ import pandas as pd
3
+
4
+ from .checks import run_checks
5
+ from .summaries import (
6
+ get_dataset_preview,
7
+ summarize_dataset_info,
8
+ summarize_variable_types,
9
+ add_reproduction_info,
10
+ summarize_variables,
11
+ summarize_interactions,
12
+ summarize_missing_values,
13
+ )
14
+
15
+ class DatasetAnalyzer:
16
+ def __init__(
17
+ self,
18
+ df: pd.DataFrame,
19
+ target_col: Optional[str] = None,
20
+ selected_checks: Optional[List[str]] = None,
21
+ ):
22
+ self.df = df
23
+ self.target_col = target_col
24
+ self.selected_checks = selected_checks
25
+ self.issues = []
26
+ self.summaries = {}
27
+ self.all_checks = [
28
+ "data_leakage", "high_missing_values", "empty_columns", "single_value_columns",
29
+ "target_leakage_patterns", "class_imbalance", "high_cardinality", "duplicates",
30
+ "mixed_data_types", "outliers", "feature_correlation", "categorical_correlation",
31
+ "mixed_correlation", "dataset_missingness", "high_zero_counts",
32
+ "extreme_text_lengths", "datetime_skew", "missing_patterns",
33
+ ]
34
+
35
+ def analyze(self) -> Dict:
36
+ """Run all summaries and checks, return summary"""
37
+ self.summaries.update(get_dataset_preview(self.df))
38
+ self.summaries.update(summarize_dataset_info(self.df))
39
+ self.summaries["variable_types"] = summarize_variable_types(self.df)
40
+ self.summaries["reproduction_info"] = add_reproduction_info(self.df)
41
+ self.summaries["variables"] = summarize_variables(self.df)
42
+ self.summaries.update(summarize_interactions(self.df))
43
+ self.summaries.update(summarize_missing_values(self.df))
44
+
45
+ checks_to_run = self.all_checks if self.selected_checks is None else [
46
+ check for check in self.selected_checks if check in self.all_checks
47
+ ]
48
+ self.issues = run_checks(self, checks_to_run)
49
+
50
+ return self._generate_summary()
51
+
52
+ def _generate_summary(self):
53
+ critical_issues = [i for i in self.issues if i.severity == "critical"]
54
+ warning_issues = [i for i in self.issues if i.severity == "warning"]
55
+ return {
56
+ "critical_count": len(critical_issues),
57
+ "warning_count": len(warning_issues),
58
+ "total_issues": len(self.issues),
59
+ "issues": [
60
+ {
61
+ "category": issue.category,
62
+ "severity": issue.severity,
63
+ "column": issue.column,
64
+ "description": issue.description,
65
+ "impact_score": issue.impact_score,
66
+ "quick_fix": issue.quick_fix,
67
+ } for issue in self.issues
68
+ ],
69
+ "summaries": self.summaries,
70
+ }
@@ -0,0 +1,34 @@
1
+ from .core import Issues
2
+ from .leakage import _check_data_leakage, _check_target_leakage_patterns
3
+ from .missing_values import _check_high_missing_values, _check_empty_columns, _check_dataset_missingness, _check_missing_patterns
4
+ from .columns import _check_single_value_columns, _check_high_cardinality, _check_duplicates, _check_mixed_data_types
5
+ from .outliers import _check_outliers, _check_high_zero_counts, _check_extreme_text_lengths, _check_datetime_skew
6
+ from .correlations import _check_feature_correlation, _check_categorical_correlation, _check_mixed_correlation
7
+ from .imbalance import _check_class_imbalance
8
+
9
# Registry mapping public check names — the same strings accepted by
# DatasetAnalyzer.selected_checks / listed in DatasetAnalyzer.all_checks —
# to their implementation functions. run_checks() dispatches through this.
CHECKS = {
    "data_leakage": _check_data_leakage,
    "high_missing_values": _check_high_missing_values,
    "empty_columns": _check_empty_columns,
    "single_value_columns": _check_single_value_columns,
    "target_leakage_patterns": _check_target_leakage_patterns,
    "class_imbalance": _check_class_imbalance,
    "high_cardinality": _check_high_cardinality,
    "duplicates": _check_duplicates,
    "mixed_data_types": _check_mixed_data_types,
    "outliers": _check_outliers,
    "feature_correlation": _check_feature_correlation,
    "categorical_correlation": _check_categorical_correlation,
    "mixed_correlation": _check_mixed_correlation,
    "dataset_missingness": _check_dataset_missingness,
    "high_zero_counts": _check_high_zero_counts,
    "extreme_text_lengths": _check_extreme_text_lengths,
    "datetime_skew": _check_datetime_skew,
    "missing_patterns": _check_missing_patterns,
}
29
+
30
def run_checks(analyzer, checks_to_run):
    """Dispatch each named check from CHECKS and collect the resulting issues.

    An unknown name raises KeyError; DatasetAnalyzer pre-filters its
    selection against its own all_checks list before calling this.
    """
    return [issue for name in checks_to_run for issue in CHECKS[name](analyzer)]
@@ -0,0 +1,91 @@
1
+ from .core import Issues
2
+
3
def _check_single_value_columns(analyzer):
    """Flag columns whose non-null values are all identical.

    A constant feature carries no signal (warning); a constant target makes
    prediction meaningless, so it is escalated to critical.
    """
    issues = []
    for col in analyzer.df.columns:
        # nunique(dropna=True): an all-NaN column counts 0 and is NOT flagged here.
        if analyzer.df[col].nunique(dropna=True) != 1:
            continue
        if col == analyzer.target_col:
            severity, impact = "critical", "high"
            quick_fix = "Options: \n- Redefine target: Replace with a more variable target (Pros: Enables modeling; Cons: Requires new data).\n- Stop analysis: Constant target prevents meaningful prediction (Pros: Avoids invalid model; Cons: Halts analysis)."
        else:
            severity, impact = "warning", "low"
            quick_fix = "Options: \n- Drop column: Not informative for modeling (Pros: Simplifies model; Cons: None).\n- Verify data: Ensure single value isn't an error (Pros: Validates data; Cons: Time-consuming)."
        issues.append(
            Issues(
                category="single_value",
                severity=severity,
                column=col,
                description=f"Column '{col}' contains only one unique value",
                impact_score=impact,
                quick_fix=quick_fix,
            )
        )
    return issues
25
+
26
def _check_high_cardinality(analyzer, threshold: int = 100, critical_threshold: float = 0.9):
    """Flag object-dtype columns with more than ``threshold`` unique values.

    Severity escalates to critical when the unique ratio exceeds
    ``critical_threshold`` — i.e. the column behaves like a row identifier.
    """
    issues = []
    categorical_cols = analyzer.df.select_dtypes(include="object").columns.tolist()
    for col in categorical_cols:
        unique_count = int(analyzer.df[col].nunique())
        if unique_count <= threshold:
            continue
        # Compute the ratio only after the count gate: unique_count > threshold
        # implies a non-empty frame, so the division is safe. (The original
        # computed this unconditionally and raised ZeroDivisionError on an
        # empty DataFrame that still had object columns.)
        unique_ratio = float(unique_count / len(analyzer.df))
        severity = "critical" if unique_ratio > critical_threshold else "warning"
        impact = "high" if severity == "critical" else "medium"
        quick_fix = (
            "Options: \n- Drop column: Avoids overfitting from unique identifiers (Pros: Simplifies model; Cons: Loses potential info).\n- Engineer feature: Extract patterns (e.g., titles from names) (Pros: Retains useful info; Cons: Requires domain knowledge).\n- Use hashing: Reduce dimensionality (Pros: Scalable; Cons: May lose interpretability)."
            if severity == "critical"
            else "Options: \n- Group rare categories: Reduce cardinality (Pros: Simplifies feature; Cons: May lose nuance).\n- Use feature hashing: Map to lower dimensions (Pros: Scalable; Cons: Less interpretable).\n- Retain and test: Evaluate feature importance (Pros: Data-driven; Cons: Risk of overfitting)."
        )
        issues.append(
            Issues(
                category="high_cardinality",
                severity=severity,
                column=col,
                description=f"Column '{col}' has {unique_count} unique values ({unique_ratio:.1%} of rows)",
                impact_score=impact,
                quick_fix=quick_fix,
            )
        )
    return issues
51
+
52
def _check_duplicates(analyzer):
    """Report fully duplicated rows; above 10% of the dataset is critical."""
    duplicate_rows = int(analyzer.df.duplicated().sum())
    if duplicate_rows == 0:
        return []
    # duplicate_rows > 0 guarantees a non-empty frame, so the division is safe.
    duplicate_ratio = float(duplicate_rows / len(analyzer.df))
    if duplicate_ratio > 0.1:
        severity, impact = "critical", "high"
        quick_fix = "Options: \n- Drop duplicates: Ensures data integrity (Pros: Cleaner data; Cons: May lose valid repeats).\n- Verify duplicates: Check if intentional (e.g., time-series) (Pros: Validates data; Cons: Time-consuming)."
    else:
        severity, impact = "warning", "medium"
        quick_fix = "Options: \n- Drop duplicates: Simplifies dataset (Pros: Cleaner data; Cons: May lose valid repeats).\n- Keep duplicates: If meaningful (e.g., repeated events) (Pros: Retains info; Cons: May bias model).\n- Test impact: Evaluate model performance with/without duplicates (Pros: Data-driven; Cons: Requires computation)."
    return [
        Issues(
            category="duplicates",
            severity=severity,
            column="__all__",  # dataset-level finding, not tied to one column
            description=f"Dataset contains {duplicate_rows} duplicate rows ({duplicate_ratio:.1%} of rows)",
            impact_score=impact,
            quick_fix=quick_fix,
        )
    ]
75
+
76
def _check_mixed_data_types(analyzer):
    """Warn when a column's non-null cells hold more than one Python type.

    Typical for object columns mixing e.g. str and float after a messy load.
    """
    issues = []
    for col in analyzer.df.columns:
        distinct_types = analyzer.df[col].dropna().map(type).nunique()
        if distinct_types <= 1:
            continue
        issues.append(
            Issues(
                category="mixed_types",
                severity="warning",
                column=col,
                description=f"Column '{col}' contains mixed data types",
                impact_score="low",
                quick_fix="Options: \n- Cast to single type: Ensure consistency (Pros: Simplifies processing; Cons: May lose nuance).\n- Split column: Separate types into new features (Pros: Preserves info; Cons: Adds complexity).\n- Investigate source: Check data collection errors (Pros: Improves quality; Cons: Time-consuming).",
            )
        )
    return issues
@@ -0,0 +1,10 @@
1
+ from dataclasses import dataclass
2
+
3
@dataclass
class Issues:
    """A single data-quality finding produced by a check function.

    Instances are collected by DatasetAnalyzer and serialized into the
    report summary dict.
    """

    category: str  # check family, e.g. "duplicates", "single_value", "feature_correlation"
    severity: str  # critical or warning
    column: str  # offending column, "c1,c2" for column pairs, or "__all__" for dataset-level
    description: str  # human-readable explanation of the finding
    impact_score: str  # high, medium, low
    quick_fix: str  # multi-option remediation text shown in reports
@@ -0,0 +1,114 @@
1
+ from .core import Issues
2
+ import pandas as pd
3
+ from scipy.stats import chi2_contingency, f_oneway
4
+ import numpy as np
5
+
6
def _check_feature_correlation(analyzer, threshold: float = 0.95, critical_threshold: float = 0.98):
    """Flag numeric column pairs whose absolute Pearson correlation exceeds
    ``threshold``; above ``critical_threshold`` the pair is critical.
    """
    issues = []
    numeric = analyzer.df.select_dtypes(include="number")
    if numeric.empty:
        return issues
    abs_corr = numeric.corr().abs()
    # Keep one triangle of the symmetric matrix (np.tril -> lower triangle,
    # diagonal included) so each unordered pair is visited exactly once;
    # the diagonal is filtered out below.
    half = abs_corr.where(np.tril(np.ones(abs_corr.shape)).astype(bool))
    flagged = []
    for second in half.index:
        for first, strength in half[second].dropna().items():
            if strength > threshold and first != second:
                flagged.append((first, second, float(strength)))
    for col1, col2, corr in flagged:
        severity = "critical" if corr > critical_threshold else "warning"
        impact = "high" if severity == "critical" else "medium"
        quick_fix = (
            "Options: \n- Drop one feature: Reduces multicollinearity (Pros: Simplifies model; Cons: Loses info).\n- Combine features: Create composite feature (e.g., PCA) (Pros: Retains info; Cons: Less interpretable).\n- Retain and test: Use robust models (e.g., trees) (Pros: Keeps info; Cons: May affect sensitive models)."
            if severity == "critical"
            else "Options: \n- Drop one feature: If less predictive (Pros: Simplifies model; Cons: Loses info).\n- Retain and test: Evaluate with robust models (Pros: Keeps info; Cons: Risk of multicollinearity).\n- Engineer feature: Combine or transform features (Pros: Reduces redundancy; Cons: Adds complexity)."
        )
        issues.append(
            Issues(
                category="feature_correlation",
                severity=severity,
                column=f"{col1},{col2}",
                description=f"Columns '{col1}' and '{col2}' are highly correlated ({corr:.2f})",
                impact_score=impact,
                quick_fix=quick_fix,
            )
        )
    return issues
38
+
39
def _check_categorical_correlation(analyzer, threshold: float = 0.8, critical_threshold: float = 0.95):
    """Flag pairs of object-dtype columns that are strongly associated.

    Association is measured with (uncorrected) Cramer's V computed from a
    chi-squared test on the pair's contingency table; V > threshold warns,
    V > critical_threshold escalates to critical.
    """
    issues = []
    categorical = analyzer.df.select_dtypes(include="object").columns.tolist()
    for i, c1 in enumerate(categorical):
        for c2 in categorical[i + 1 :]:  # each unordered pair exactly once
            try:
                table = pd.crosstab(analyzer.df[c1], analyzer.df[c2])
                chi2, _, _, _ = chi2_contingency(table)
                n = table.sum().sum()
                phi2 = chi2 / n
                r, k = table.shape
                # NOTE(review): no bias correction, and min(k-1, r-1) can be 0
                # when a column has a single level — numpy division then yields
                # inf/nan rather than raising; an inf V would be reported as
                # critical. Confirm whether such columns should be skipped.
                cramers_v = np.sqrt(phi2 / min(k - 1, r - 1))
                if cramers_v > threshold:
                    severity = "critical" if cramers_v > critical_threshold else "warning"
                    impact = "high" if severity == "critical" else "medium"
                    quick_fix = (
                        "Options: \n- Drop one feature: Avoids overfitting from high redundancy (Pros: Simplifies model; Cons: Loses info).\n- Engineer feature: Extract common patterns (e.g., group categories) (Pros: Retains info; Cons: Requires domain knowledge).\n- Retain and test: Use robust models (e.g., trees) (Pros: Keeps info; Cons: May affect sensitive models)."
                        if severity == "critical"
                        else "Options: \n- Drop one feature: If less predictive (Pros: Simplifies model; Cons: Loses info).\n- Retain and test: Evaluate with robust models (Pros: Keeps info; Cons: Risk of redundancy).\n- Engineer feature: Group categories or encode differently (Pros: Reduces redundancy; Cons: Adds complexity)."
                    )
                    issues.append(
                        Issues(
                            category="feature_correlation",
                            severity=severity,
                            column=f"{c1},{c2}",
                            description=f"Columns '{c1}' and '{c2}' are highly associated (Cramer's V: {float(cramers_v):.2f})",
                            impact_score=impact,
                            quick_fix=quick_fix,
                        )
                    )
            except Exception:
                # Best-effort: degenerate pairs (empty/invalid contingency
                # tables) are skipped rather than failing the whole analysis.
                continue
    return issues
72
+
73
def _check_mixed_correlation(analyzer, p_threshold: float = 0.05, critical_p_threshold: float = 0.001):
    """Flag categorical/numeric column pairs with strong association.

    Runs a one-way ANOVA of the numeric column grouped by category level.
    p < p_threshold warns; p < critical_p_threshold combined with a large
    F statistic (> 20) escalates to critical.
    """
    issues = []
    cat_cols = analyzer.df.select_dtypes(
        include=["object", "category"]
    ).columns.tolist()
    num_cols = analyzer.df.select_dtypes(include=["int64", "float64"]).columns.tolist()
    for cat in cat_cols:
        cat_series = analyzer.df[cat]
        for num in num_cols:
            num_series = analyzer.df[num]
            # Build one numeric sample per category level. Each level is
            # filtered once (the original filtered every level twice: once
            # for the length test and again for the values).
            groups = []
            for level in cat_series.dropna().unique():
                sample = num_series[cat_series == level].dropna().to_numpy()
                if len(sample) > 1:  # ANOVA needs >= 2 observations per group
                    groups.append(sample)
            # Need at least two groups with some within-group variance.
            if len(groups) < 2 or all(np.var(g, ddof=1) == 0 for g in groups):
                continue
            try:
                f_stat, p_val = f_oneway(*groups)
                if p_val < p_threshold:
                    severity = (
                        "critical"
                        if p_val < critical_p_threshold and f_stat > 20.0
                        else "warning"
                    )
                    impact = "high" if severity == "critical" else "medium"
                    quick_fix = (
                        "Options: \n- Drop one feature: Avoids redundancy (Pros: Simplifies model; Cons: Loses info).\n- Engineer feature: Transform categorical or numeric feature (Pros: Retains info; Cons: Adds complexity).\n- Retain and test: Use robust models (e.g., trees) (Pros: Keeps info; Cons: May affect sensitive models)."
                        if severity == "critical"
                        else "Options: \n- Drop one feature: If less predictive (Pros: Simplifies model; Cons: Loses info).\n- Retain and test: Evaluate with robust models (Pros: Keeps info; Cons: Risk of redundancy).\n- Engineer feature: Transform or encode differently (Pros: Reduces redundancy; Cons: Adds complexity)."
                    )
                    issues.append(
                        Issues(
                            category="feature_correlation",
                            severity=severity,
                            column=f"{cat},{num}",
                            description=f"Columns '{cat}' and '{num}' show strong association (F: {float(f_stat):.2f}, p: {float(p_val):.4f})",
                            impact_score=impact,
                            quick_fix=quick_fix,
                        )
                    )
            except Exception:
                # Best-effort: skip pairs where ANOVA fails rather than
                # aborting the whole analysis.
                continue
    return issues
@@ -0,0 +1,18 @@
1
+ from .core import Issues
2
+
3
def _check_class_imbalance(analyzer, threshold: float = 0.9):
    """Warn when one class of the target holds more than ``threshold`` of rows."""
    if not analyzer.target_col or analyzer.target_col not in analyzer.df.columns:
        return []
    shares = analyzer.df[analyzer.target_col].value_counts(normalize=True)
    # Keep the comparison in this direction: on an empty frame shares.max()
    # is NaN, and NaN > threshold is False, so nothing is flagged.
    if shares.max() > threshold:
        return [
            Issues(
                category="class_imbalance",
                severity="warning",
                column=analyzer.target_col,
                description=f"Target '{analyzer.target_col}' is imbalanced ({float(shares.max()):.1%} in one class)",
                impact_score="medium",
                quick_fix="Options: \n- Resample data: Use oversampling (e.g., SMOTE) or undersampling (Pros: Balances classes; Cons: May introduce bias or lose data).\n- Use class weights: Adjust model weights for imbalance (Pros: Simple; Cons: Model-dependent).\n- Stratified sampling: Ensure balanced splits in training (Pros: Improves evaluation; Cons: Requires careful implementation).",
            )
        ]
    return []