ethiclean-aditya 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ethiclean_aditya-0.1.0/PKG-INFO +10 -0
- ethiclean_aditya-0.1.0/README.md +1 -0
- ethiclean_aditya-0.1.0/pyproject.toml +16 -0
- ethiclean_aditya-0.1.0/setup.cfg +4 -0
- ethiclean_aditya-0.1.0/src/ethiclean/__init__.py +0 -0
- ethiclean_aditya-0.1.0/src/ethiclean/bias.py +74 -0
- ethiclean_aditya-0.1.0/src/ethiclean/cleaner.py +35 -0
- ethiclean_aditya-0.1.0/src/ethiclean/report.py +107 -0
- ethiclean_aditya-0.1.0/src/ethiclean_aditya.egg-info/PKG-INFO +10 -0
- ethiclean_aditya-0.1.0/src/ethiclean_aditya.egg-info/SOURCES.txt +11 -0
- ethiclean_aditya-0.1.0/src/ethiclean_aditya.egg-info/dependency_links.txt +1 -0
- ethiclean_aditya-0.1.0/src/ethiclean_aditya.egg-info/requires.txt +1 -0
- ethiclean_aditya-0.1.0/src/ethiclean_aditya.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: ethiclean-aditya
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A lightweight tool for cleaning data and checking demographic bias.
|
|
5
|
+
Author-email: Aditya OHS <1adityaneil@gmail.com>
|
|
6
|
+
Requires-Python: >=3.8
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
Requires-Dist: pandas
|
|
9
|
+
|
|
10
|
+
# ethiclean
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# ethiclean
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "ethiclean-aditya" # Must be unique on PyPI!
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
authors = [
|
|
9
|
+
{ name="Aditya OHS", email="1adityaneil@gmail.com" },
|
|
10
|
+
]
|
|
11
|
+
description = "A lightweight tool for cleaning data and checking demographic bias."
|
|
12
|
+
readme = "README.md"
|
|
13
|
+
requires-python = ">=3.8"
|
|
14
|
+
dependencies = [
|
|
15
|
+
"pandas",
|
|
16
|
+
]
|
|
File without changes
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import warnings
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class BiasDetector:
    """Detect potential bias in categorical columns of a DataFrame."""

    def detect_class_imbalance(self, df, column_name, threshold=80.0):
        """
        Analyze the distribution of values in a column and flag class imbalance.

        The share of every unique non-null value is computed; if the share of
        any single group is strictly greater than ``threshold`` percent, a
        ``UserWarning`` is issued and the group is reported as dominant.

        Args:
            df (pd.DataFrame): The DataFrame containing the column to analyze.
            column_name (str): The name of the column to check for bias.
            threshold (float): Percentage a single group must exceed to be
                reported as dominant. Defaults to 80.0.

        Returns:
            dict: A dictionary containing:
                - 'distributions': dict mapping each unique value to its
                  percentage, rounded to two decimals
                - 'imbalance_detected': bool, True if a group exceeds the threshold
                - 'dominant_group': the group exceeding the threshold, or None
                - 'dominant_percentage': float or None, the (rounded)
                  percentage of the dominant group
                - 'total_records': int, the number of non-null records analyzed

        Raises:
            ValueError: If the column is missing or has no non-null values.
        """
        if column_name not in df.columns:
            raise ValueError(f"Column '{column_name}' not found in DataFrame.")

        # Cast to a plain int so the result matches the documented type
        # (pandas returns a numpy integer here).
        total_records = int(df[column_name].notna().sum())

        if total_records == 0:
            raise ValueError(f"Column '{column_name}' contains no non-null values.")

        # value_counts() ignores nulls, so the counts add up to total_records.
        value_counts = df[column_name].value_counts()

        # Rounded percentages are for presentation only.
        distributions = (value_counts / total_records * 100).round(2).to_dict()

        imbalance_detected = False
        dominant_group = None
        dominant_percentage = None

        # Decide on the raw counts, not the rounded percentages: a group at
        # e.g. 80.004% rounds to 80.0 and would otherwise slip past the check.
        # Comparing count * 100 against threshold * total also avoids the
        # floating-point noise of a division.
        for (group, percentage), count in zip(distributions.items(), value_counts.tolist()):
            if count * 100 > threshold * total_records:
                imbalance_detected = True
                dominant_group = group
                dominant_percentage = percentage
                warnings.warn(
                    f"Potential bias detected in column '{column_name}': "
                    f"Group '{group}' represents {percentage}% of the dataset.",
                    UserWarning
                )
                break

        return {
            'distributions': distributions,
            'imbalance_detected': imbalance_detected,
            'dominant_group': dominant_group,
            'dominant_percentage': dominant_percentage,
            'total_records': total_records
        }
|
|
74
|
+
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class DataCleaner:
    """Load a CSV file and tidy up its headers and empty rows."""

    def clean_csv(self, file_path):
        """
        Read a CSV file and return a tidied DataFrame.

        Header names are stripped of surrounding whitespace, lower-cased, and
        have their spaces replaced with underscores. Rows in which every cell
        is NaN are dropped.

        Args:
            file_path (str): Path to the CSV file to be cleaned.

        Returns:
            pd.DataFrame: The data with standardized column names and without
            fully blank rows.
        """
        frame = pd.read_csv(file_path)

        # Normalize the header names step by step.
        trimmed = frame.columns.str.strip()
        lowered = trimmed.str.lower()
        frame.columns = lowered.str.replace(' ', '_')

        # Keep only rows that hold at least one non-null value.
        return frame.dropna(axis=0, how='all')
|
|
35
|
+
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
from ethiclean.cleaner import DataCleaner
|
|
3
|
+
from ethiclean.bias import BiasDetector
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class ReportGenerator:
    """Generate formatted health reports from cleaned data and bias analysis."""

    def generate_report(self, original_df, cleaned_df, bias_results):
        """
        Print a formatted dataset health report to the terminal.

        The report has three sections: data cleaning statistics (row counts
        and removal rate), the list of column names in the cleaned data, and
        the bias analysis outcome with a text bar chart of the distribution.

        Args:
            original_df (pd.DataFrame): The original DataFrame before cleaning.
            cleaned_df (pd.DataFrame): The cleaned DataFrame after processing.
            bias_results (dict): Dictionary output from
                BiasDetector.detect_class_imbalance(). The keys read here are
                'imbalance_detected', 'dominant_group', 'dominant_percentage'
                and 'distributions'.
        """
        original_rows = len(original_df)
        cleaned_rows = len(cleaned_df)
        rows_removed = original_rows - cleaned_rows
        # An empty original dataset has nothing to remove; report 0% instead
        # of dividing by zero.
        removal_rate = rows_removed / original_rows * 100 if original_rows else 0.0

        # Report header
        print("\n" + "=" * 70)
        print(" " * 15 + "DATASET HEALTH REPORT")
        print("=" * 70 + "\n")

        # Data cleaning summary
        print("📊 DATA CLEANING SUMMARY")
        print("-" * 70)
        print(f" Original rows: {original_rows:,}")
        print(f" Cleaned rows: {cleaned_rows:,}")
        print(f" Rows removed: {rows_removed:,}")
        print(f" Removal rate: {removal_rate:.2f}%")
        print()

        # Column standardization information
        print("🔧 COLUMN STANDARDIZATION")
        print("-" * 70)
        print(f" Total columns: {len(cleaned_df.columns)}")
        print(" Standardized columns:")
        for col in cleaned_df.columns:
            print(f" • {col}")
        print()

        # Bias analysis results
        print("⚠️ BIAS ANALYSIS RESULTS")
        print("-" * 70)

        if bias_results['imbalance_detected']:
            print(" ⛔ IMBALANCE DETECTED!")
            print(f" Dominant group: {bias_results['dominant_group']}")
            print(f" Dominance level: {bias_results['dominant_percentage']}%")
            print(" Status: REQUIRES ATTENTION")
        else:
            print(" ✅ No significant imbalance detected")
            print(" Status: ALL CLEAR")

        print()
        print(" Distribution Breakdown:")
        # Largest group first.
        for group, percentage in sorted(
            bias_results['distributions'].items(),
            key=lambda item: item[1],
            reverse=True
        ):
            # One bar character per 5 percentage points.
            bar = "█" * int(percentage / 5)
            # str() keeps the width spec valid for labels of any type
            # (tuples, for instance, reject a bare width format spec).
            print(f" {str(group):20} {percentage:6.2f}% {bar}")
        print()
        print("=" * 70 + "\n")

    def generate_report_from_csv(self, csv_file_path, column_to_analyze):
        """
        Clean a CSV file, run bias detection on one column, and print the report.

        Workflow:
        1. Load the original CSV
        2. Clean the data using DataCleaner
        3. Detect bias using BiasDetector
        4. Generate and display the report

        Args:
            csv_file_path (str): Path to the CSV file to process.
            column_to_analyze (str): Name of the column to check for bias.
                Either the standardized name (lowercase, underscores) or the
                header as written in the CSV file is accepted.
        """
        # Load the untouched data for the before/after row comparison.
        original_df = pd.read_csv(csv_file_path)

        cleaned_df = DataCleaner().clean_csv(csv_file_path)

        # Cleaning renames the headers, so a caller who passes the header as
        # it appears in the file would otherwise get "column not found".
        # Fall back to the standardized spelling when it matches.
        if (
            isinstance(column_to_analyze, str)
            and column_to_analyze not in cleaned_df.columns
        ):
            standardized = column_to_analyze.strip().lower().replace(' ', '_')
            if standardized in cleaned_df.columns:
                column_to_analyze = standardized

        bias_results = BiasDetector().detect_class_imbalance(cleaned_df, column_to_analyze)

        self.generate_report(original_df, cleaned_df, bias_results)
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: ethiclean-aditya
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A lightweight tool for cleaning data and checking demographic bias.
|
|
5
|
+
Author-email: Aditya OHS <1adityaneil@gmail.com>
|
|
6
|
+
Requires-Python: >=3.8
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
Requires-Dist: pandas
|
|
9
|
+
|
|
10
|
+
# ethiclean
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
src/ethiclean/__init__.py
|
|
4
|
+
src/ethiclean/bias.py
|
|
5
|
+
src/ethiclean/cleaner.py
|
|
6
|
+
src/ethiclean/report.py
|
|
7
|
+
src/ethiclean_aditya.egg-info/PKG-INFO
|
|
8
|
+
src/ethiclean_aditya.egg-info/SOURCES.txt
|
|
9
|
+
src/ethiclean_aditya.egg-info/dependency_links.txt
|
|
10
|
+
src/ethiclean_aditya.egg-info/requires.txt
|
|
11
|
+
src/ethiclean_aditya.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
pandas
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
ethiclean
|