ethiclean-aditya 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,10 @@
1
+ Metadata-Version: 2.4
2
+ Name: ethiclean-aditya
3
+ Version: 0.1.0
4
+ Summary: A lightweight tool for cleaning data and checking demographic bias.
5
+ Author-email: Aditya OHS <1adityaneil@gmail.com>
6
+ Requires-Python: >=3.8
7
+ Description-Content-Type: text/markdown
8
+ Requires-Dist: pandas
9
+
10
+ # ethiclean
@@ -0,0 +1 @@
1
+ # ethiclean
@@ -0,0 +1,16 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "ethiclean-aditya" # Must be unique on PyPI!
7
+ version = "0.1.0"
8
+ authors = [
9
+ { name="Aditya OHS", email="1adityaneil@gmail.com" },
10
+ ]
11
+ description = "A lightweight tool for cleaning data and checking demographic bias."
12
+ readme = "README.md"
13
+ requires-python = ">=3.8"
14
+ dependencies = [
15
+ "pandas",
16
+ ]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
File without changes
@@ -0,0 +1,74 @@
1
+ import pandas as pd
2
+ import warnings
3
+
4
+
5
class BiasDetector:
    """Detect class imbalance in a categorical column of a DataFrame."""

    def detect_class_imbalance(self, df, column_name, threshold=80.0):
        """
        Analyze the distribution of values in a column and flag a dominant group.

        The share of every distinct non-null value is computed, and the most
        frequent value is flagged when its share is strictly greater than
        ``threshold`` percent. A ``UserWarning`` is issued when that happens.

        Args:
            df (pd.DataFrame): The DataFrame containing the column to analyze.
            column_name (str): The name of the column to check for bias.
            threshold (float): Percentage (0-100) a single group must exceed
                to count as dominant. Defaults to 80.

        Returns:
            dict: A dictionary containing:
                - 'distributions': dict mapping each distinct value to its
                  percentage, rounded to two decimals
                - 'imbalance_detected': bool, True if the most frequent group
                  exceeds ``threshold``
                - 'dominant_group': the dominant group's label, or None
                - 'dominant_percentage': float or None, the rounded
                  percentage of the dominant group
                - 'total_records': int, the number of non-null records analyzed

        Raises:
            ValueError: If ``threshold`` is outside 0-100, if the column is
                missing, or if the column has no non-null values.
        """
        if not 0 <= threshold <= 100:
            raise ValueError(
                f"threshold must be between 0 and 100, got {threshold!r}."
            )

        if column_name not in df.columns:
            raise ValueError(f"Column '{column_name}' not found in DataFrame.")

        # Convert to a plain int so the result does not leak a NumPy scalar.
        total_records = int(df[column_name].notna().sum())
        if total_records == 0:
            raise ValueError(f"Column '{column_name}' contains no non-null values.")

        # value_counts() ignores nulls and sorts by frequency, largest first.
        value_counts = df[column_name].value_counts()

        # Rounded percentages are for reporting only; see the threshold test below.
        distributions = (value_counts / total_records * 100).round(2).to_dict()

        imbalance_detected = False
        dominant_group = None
        dominant_percentage = None

        # Only the most frequent group needs checking: if it does not exceed
        # the threshold, no other group can.
        top_group, top_percentage = next(iter(distributions.items()))
        top_count = int(value_counts.iloc[0])

        # Compare counts rather than percentages. Rounding would hide a share
        # such as 80.004 %, while 4 / 5 * 100 evaluates to 80.00000000000001
        # in floating point and would wrongly flag an exact 80 % share.
        if top_count * 100 > threshold * total_records:
            imbalance_detected = True
            dominant_group = top_group
            dominant_percentage = top_percentage
            warnings.warn(
                f"Potential bias detected in column '{column_name}': "
                f"Group '{top_group}' represents {top_percentage}% of the dataset.",
                UserWarning,
                stacklevel=2,  # attribute the warning to the caller's line
            )

        return {
            'distributions': distributions,
            'imbalance_detected': imbalance_detected,
            'dominant_group': dominant_group,
            'dominant_percentage': dominant_percentage,
            'total_records': total_records,
        }
74
+
@@ -0,0 +1,35 @@
1
+ import pandas as pd
2
+
3
+
4
class DataCleaner:
    """Clean and standardize messy CSV data."""

    def clean_csv(self, file_path, **read_csv_kwargs):
        """
        Load a CSV, standardize its column headers and drop fully blank rows.

        This method performs the following operations:
        1. Reads the CSV file into a DataFrame
        2. Strips leading/trailing whitespace from column headers
        3. Converts all column headers to lowercase
        4. Replaces spaces in column names with underscores
        5. Removes any rows that are completely blank (all NaN values)

        The original row index is kept, so dropped rows leave gaps in it.

        Args:
            file_path: Path to the CSV file to be cleaned (anything accepted
                by ``pandas.read_csv``).
            **read_csv_kwargs: Optional keyword arguments forwarded unchanged
                to ``pandas.read_csv`` (for example ``sep`` or ``encoding``).

        Returns:
            pd.DataFrame: A DataFrame with standardized column names and no
            all-NaN rows.
        """
        df = pd.read_csv(file_path, **read_csv_kwargs)

        # regex=False makes the space a literal on every pandas version; the
        # default of this flag changed between pandas 1.x and 2.x.
        df.columns = (
            df.columns
            .str.strip()
            .str.lower()
            .str.replace(' ', '_', regex=False)
        )

        # how='all' drops a row only when every one of its cells is NaN.
        return df.dropna(how='all')
35
+
@@ -0,0 +1,107 @@
1
+ import pandas as pd
2
+ from ethiclean.cleaner import DataCleaner
3
+ from ethiclean.bias import BiasDetector
4
+
5
+
6
class ReportGenerator:
    """Generate formatted health reports from cleaned data and bias analysis."""

    def generate_report(self, original_df, cleaned_df, bias_results):
        """
        Print a formatted dataset health report to the terminal.

        The report contains, in order:
        1. A title banner
        2. Data cleaning statistics (original, cleaned and removed row counts,
           and the removal rate)
        3. The list of standardized column names
        4. Bias analysis results and a per-group distribution bar chart

        Args:
            original_df (pd.DataFrame): The original DataFrame before cleaning.
            cleaned_df (pd.DataFrame): The cleaned DataFrame after processing.
            bias_results (dict): Dictionary output from
                BiasDetector.detect_class_imbalance(). The keys read here are
                'imbalance_detected', 'dominant_group', 'dominant_percentage'
                and 'distributions'.

        Returns:
            None. The report is written to standard output.
        """
        original_rows = len(original_df)
        cleaned_rows = len(cleaned_df)
        rows_removed = original_rows - cleaned_rows
        # An empty original dataset has nothing to remove; report 0 % instead
        # of dividing by zero.
        removal_rate = rows_removed / original_rows * 100 if original_rows else 0.0

        banner = "=" * 70
        divider = "-" * 70

        # Report header
        print("\n" + banner)
        print(" " * 15 + "DATASET HEALTH REPORT")
        print(banner + "\n")

        # Data cleaning summary
        print("📊 DATA CLEANING SUMMARY")
        print(divider)
        print(f" Original rows: {original_rows:,}")
        print(f" Cleaned rows: {cleaned_rows:,}")
        print(f" Rows removed: {rows_removed:,}")
        print(f" Removal rate: {removal_rate:.2f}%")
        print()

        # Column standardization information
        print("🔧 COLUMN STANDARDIZATION")
        print(divider)
        print(f" Total columns: {len(cleaned_df.columns)}")
        print(" Standardized columns:")
        for col in cleaned_df.columns:
            print(f" • {col}")
        print()

        # Bias analysis results
        print("⚠️ BIAS ANALYSIS RESULTS")
        print(divider)

        if bias_results['imbalance_detected']:
            print(" ⛔ IMBALANCE DETECTED!")
            print(f" Dominant group: {bias_results['dominant_group']}")
            print(f" Dominance level: {bias_results['dominant_percentage']}%")
            print(" Status: REQUIRES ATTENTION")
        else:
            print(" ✅ No significant imbalance detected")
            print(" Status: ALL CLEAR")

        print()
        print(" Distribution Breakdown:")
        ranked_groups = sorted(
            bias_results['distributions'].items(),
            key=lambda item: item[1],
            reverse=True,
        )
        for group, percentage in ranked_groups:
            # One bar character per 5 percentage points.
            bar = "█" * int(percentage / 5)
            # str() keeps every label left-aligned and avoids a TypeError for
            # label types that do not accept a width format spec (e.g. tuples).
            print(f" {str(group):20} {percentage:6.2f}% {bar}")
        print()
        print(banner + "\n")

    def generate_report_from_csv(self, csv_file_path, column_to_analyze):
        """
        Convenience method that cleans a CSV, checks bias and prints the report.

        This method orchestrates the full workflow:
        1. Load the original CSV
        2. Clean the data using DataCleaner
        3. Detect bias using BiasDetector
        4. Generate and display the report

        Args:
            csv_file_path (str): Path to the CSV file to process.
            column_to_analyze (str): Name of the column to check for bias.
                It is looked up in the cleaned DataFrame.
        """
        # The file is read twice: once here for the "before" row count and
        # once inside DataCleaner.clean_csv, so the path must be re-readable.
        original_df = pd.read_csv(csv_file_path)

        cleaner = DataCleaner()
        cleaned_df = cleaner.clean_csv(csv_file_path)

        detector = BiasDetector()
        bias_results = detector.detect_class_imbalance(cleaned_df, column_to_analyze)

        self.generate_report(original_df, cleaned_df, bias_results)
@@ -0,0 +1,10 @@
1
+ Metadata-Version: 2.4
2
+ Name: ethiclean-aditya
3
+ Version: 0.1.0
4
+ Summary: A lightweight tool for cleaning data and checking demographic bias.
5
+ Author-email: Aditya OHS <1adityaneil@gmail.com>
6
+ Requires-Python: >=3.8
7
+ Description-Content-Type: text/markdown
8
+ Requires-Dist: pandas
9
+
10
+ # ethiclean
@@ -0,0 +1,11 @@
1
+ README.md
2
+ pyproject.toml
3
+ src/ethiclean/__init__.py
4
+ src/ethiclean/bias.py
5
+ src/ethiclean/cleaner.py
6
+ src/ethiclean/report.py
7
+ src/ethiclean_aditya.egg-info/PKG-INFO
8
+ src/ethiclean_aditya.egg-info/SOURCES.txt
9
+ src/ethiclean_aditya.egg-info/dependency_links.txt
10
+ src/ethiclean_aditya.egg-info/requires.txt
11
+ src/ethiclean_aditya.egg-info/top_level.txt