autooutlier 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- autooutlier/__init__.py +26 -0
- autooutlier/detection.py +67 -0
- autooutlier/handling.py +173 -0
- autooutlier/statistics.py +71 -0
- autooutlier/summary.py +25 -0
- autooutlier/utils.py +21 -0
- autooutlier/version.py +4 -0
- autooutlier-0.1.0.dist-info/METADATA +202 -0
- autooutlier-0.1.0.dist-info/RECORD +12 -0
- autooutlier-0.1.0.dist-info/WHEEL +5 -0
- autooutlier-0.1.0.dist-info/licenses/LICENSE +21 -0
- autooutlier-0.1.0.dist-info/top_level.txt +1 -0
autooutlier/__init__.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
"""
|
|
3
|
+
autooutlier - Automatic Outlier Detection and Handling for Python
|
|
4
|
+
=================================================================
|
|
5
|
+
|
|
6
|
+
A professional Python package for automatic outlier detection,
|
|
7
|
+
handling, and statistical analysis of numerical data.
|
|
8
|
+
|
|
9
|
+
Usage:
|
|
10
|
+
>>> import autooutlier
|
|
11
|
+
>>> from autooutlier import handle_outliers, detect_outliers
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from .handling import handle_outliers
|
|
15
|
+
from .summary import before_cleaning_summary
|
|
16
|
+
from .detection import detect_outliers
|
|
17
|
+
from .detection import detect_outlier_method
|
|
18
|
+
from .version import __version__
|
|
19
|
+
|
|
20
|
+
__all__ = [
|
|
21
|
+
"handle_outliers",
|
|
22
|
+
"before_cleaning_summary",
|
|
23
|
+
"detect_outliers",
|
|
24
|
+
"detect_outlier_method",
|
|
25
|
+
"__version__",
|
|
26
|
+
]
|
autooutlier/detection.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
"""Outlier detection methods."""
|
|
3
|
+
|
|
4
|
+
import numpy as np
|
|
5
|
+
|
|
6
|
+
from .statistics import mean, std, q1, q3, iqr, skew_measurment
|
|
7
|
+
from .utils import is_numeric
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def Iqr_method(data, column):
    """Flag values of ``data[column]`` lying outside the 1.5*IQR fences.

    The lower fence is Q1 - 1.5*IQR and the upper fence is Q3 + 1.5*IQR.
    Returns the element-wise boolean result of comparing the column
    against those fences (True where the value is outside them).
    """
    values = data[column]
    spread = iqr(values)
    lower_fence = q1(values) - 1.5 * spread
    upper_fence = q3(values) + 1.5 * spread
    return (values > upper_fence) | (values < lower_fence)
|
|
18
|
+
|
|
19
|
+
def z_score_method(data, column):
    """Flag values whose absolute z-score exceeds 3.

    When the standard deviation is exactly zero no z-score can be
    computed, so an all-False NumPy array of ``len(data)`` is returned.
    Otherwise the result is the element-wise comparison on the column.
    """
    values = data[column]
    center = mean(values)
    spread = std(values)
    if spread == 0:
        return np.zeros(len(data), dtype=bool)
    return abs((values - center) / spread) > 3
|
|
27
|
+
|
|
28
|
+
def modified_z_score(data, column):
    """Flag values whose absolute modified z-score exceeds 3.5.

    The score is 0.6745 * (x - median) / MAD, where MAD is the median of
    the absolute deviations from the median. When MAD is exactly zero an
    all-False NumPy array of ``len(data)`` is returned instead.
    """
    values = data[column]
    deviations = values - values.median()
    mad = abs(deviations).median()
    if mad == 0:
        return np.zeros(len(data), dtype=bool)
    scores = 0.6745 * deviations / mad
    return abs(scores) > 3.5
|
|
37
|
+
|
|
38
|
+
def percentile_method(data, column):
    """Flag values below the 5th or above the 95th percentile of the column.

    Returns the element-wise boolean comparison of ``data[column]``
    against those two limits.
    """
    values = data[column]
    lower_limit, upper_limit = np.percentile(values, [5, 95])
    return (values > upper_limit) | (values < lower_limit)
|
|
43
|
+
|
|
44
|
+
def detect_outlier_method(data, column):
    """Suggest a detection method name from the column's skewness label.

    Returns ``'z_score'`` for the symmetric labels, ``'modified_z_score'``
    for the skewed labels and ``'Iqr_method'`` for anything else. For a
    non-numeric column the message string is returned instead.
    """
    if not is_numeric(data, column):
        return "It is not a numerical Column"

    symmetric_labels = ('Perfectly Symmetric', 'Approximately Symmetric')
    skewed_labels = (
        'Highly Right Skewed',
        'Highly Left Skewed',
        'Moderately Right Skewed',
        'Moderately Left Skewed',
    )

    label = skew_measurment(data[column])
    if label in symmetric_labels:
        return 'z_score'
    if label in skewed_labels:
        return 'modified_z_score'
    return 'Iqr_method'
|
|
56
|
+
|
|
57
|
+
def detect_outliers(data, column):
    """Run the detector suggested by ``detect_outlier_method`` on the column.

    ``'z_score'`` and ``'modified_z_score'`` map to their detectors; any
    other suggestion falls back to ``Iqr_method``. Returns that
    detector's boolean mask.
    """
    chosen = detect_outlier_method(data, column)
    if chosen == 'z_score':
        detector = z_score_method
    elif chosen == 'modified_z_score':
        detector = modified_z_score
    else:
        detector = Iqr_method
    return detector(data, column)
|
autooutlier/handling.py
ADDED
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
"""Outlier handling and replacement methods."""
|
|
3
|
+
|
|
4
|
+
import numpy as np
|
|
5
|
+
import pandas as pd
|
|
6
|
+
|
|
7
|
+
from .statistics import mean, median, mode, q1, q3, iqr, skew, skew_measurment
|
|
8
|
+
from .utils import is_numeric, is_time_series, is_continous, outlier_count, outlier_percentage
|
|
9
|
+
from .detection import (
|
|
10
|
+
Iqr_method,
|
|
11
|
+
z_score_method,
|
|
12
|
+
modified_z_score,
|
|
13
|
+
percentile_method,
|
|
14
|
+
detect_outlier_method,
|
|
15
|
+
detect_outliers,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def winsorization(data, column):
    """Cap ``data[column]`` at the 1.5*IQR fences, modifying ``data`` in place.

    The column is first converted to float. Values above the upper fence
    are set to the upper fence, then values below the lower fence are set
    to the lower fence. Returns the same ``data`` object.
    """
    data[column] = data[column].astype(float)
    values = data[column]
    spread = iqr(values)
    lower_fence = q1(values) - 1.5 * spread
    upper_fence = q3(values) + 1.5 * spread
    too_high = data[column] > upper_fence
    data.loc[too_high, column] = upper_fence
    too_low = data[column] < lower_fence
    data.loc[too_low, column] = lower_fence
    return data
|
|
29
|
+
|
|
30
|
+
def interpolate(data, column, outliers):
    """Blank out the flagged rows and fill them by interpolation.

    Parameters:
        data: DataFrame holding the column; it is modified in place.
        column: label of the column to clean.
        outliers: boolean mask selecting the rows to replace.

    Returns:
        The same ``data`` object, with the flagged entries of ``column``
        replaced by the result of ``Series.interpolate()``.
    """
    # NaN cannot live in an integer column; convert up front, as the
    # median/mean/mode replacers in this module do, rather than relying on
    # implicit upcasting during assignment.
    if not pd.api.types.is_float_dtype(data[column]):
        data[column] = data[column].astype(float)
    data.loc[outliers, column] = np.nan
    data[column] = data[column].interpolate()
    return data
|
|
34
|
+
|
|
35
|
+
def replace_with_median(data, column, outliers):
    """Overwrite the flagged rows of ``column`` with the column median.

    The column is converted to float first and the median is computed on
    the full column (flagged rows included). ``data`` is modified in
    place and returned.
    """
    as_float = data[column].astype(float)
    data[column] = as_float
    fill_value = median(data[column])
    data.loc[outliers, column] = fill_value
    return data
|
|
40
|
+
|
|
41
|
+
def replace_with_mean(data, column, outliers):
    """Overwrite the flagged rows of ``column`` with the column mean.

    The column is converted to float first and the mean is computed on
    the full column (flagged rows included). ``data`` is modified in
    place and returned.
    """
    as_float = data[column].astype(float)
    data[column] = as_float
    fill_value = mean(data[column])
    data.loc[outliers, column] = fill_value
    return data
|
|
46
|
+
|
|
47
|
+
def replace_with_mode(data, column, outliers):
    """Overwrite the flagged rows of ``column`` with the column mode.

    The column is converted to float first; the first entry of the mode
    result is used as the replacement. ``data`` is modified in place and
    returned.
    """
    as_float = data[column].astype(float)
    data[column] = as_float
    modes = mode(data[column])
    data.loc[outliers, column] = modes.iloc[0]
    return data
|
|
52
|
+
|
|
53
|
+
def remove_outliers(data, column, outliers):
    """Return a copy of ``data`` without the rows flagged in ``outliers``.

    ``column`` is accepted for signature parity with the other handlers
    and is not used. The input frame is left untouched.
    """
    keep = ~outliers
    kept_rows = data[keep]
    return kept_rows.copy()
|
|
56
|
+
|
|
57
|
+
def replace_with_custom_value(data, column, outliers, value):
    """Overwrite the flagged rows of ``column`` with ``value``.

    ``data`` is modified in place and returned.
    """
    flagged = outliers
    data.loc[flagged, column] = value
    return data
|
|
60
|
+
|
|
61
|
+
def replace_with_backward_fill(data, column, outliers):
    """Replace flagged rows with the next unflagged value in the column.

    Parameters:
        data: DataFrame holding the column; it is modified in place.
        column: label of the column to clean.
        outliers: boolean mask selecting the rows to replace.

    Returns:
        The same ``data`` object. Flagged entries are back-filled; any
        that remain empty at the end of the column are then forward-filled.
    """
    # NaN cannot live in an integer column; convert up front, as the
    # median/mean/mode replacers in this module do, rather than relying on
    # implicit upcasting during assignment.
    if not pd.api.types.is_float_dtype(data[column]):
        data[column] = data[column].astype(float)
    data.loc[outliers, column] = np.nan
    data[column] = data[column].bfill().ffill()
    return data
|
|
65
|
+
|
|
66
|
+
def replace_with_forward_fill(data, column, outliers):
    """Replace flagged rows with the previous unflagged value in the column.

    Parameters:
        data: DataFrame holding the column; it is modified in place.
        column: label of the column to clean.
        outliers: boolean mask selecting the rows to replace.

    Returns:
        The same ``data`` object. Flagged entries are forward-filled; any
        that remain empty at the start of the column are then back-filled.
    """
    # NaN cannot live in an integer column; convert up front, as the
    # median/mean/mode replacers in this module do, rather than relying on
    # implicit upcasting during assignment.
    if not pd.api.types.is_float_dtype(data[column]):
        data[column] = data[column].astype(float)
    data.loc[outliers, column] = np.nan
    data[column] = data[column].ffill().bfill()
    return data
|
|
70
|
+
|
|
71
|
+
def is_binary(data, column):
    """Return True when ``data[column]`` has exactly two distinct values."""
    distinct_values = data[column].nunique()
    return distinct_values == 2
|
|
73
|
+
|
|
74
|
+
def detect_handler(data, column):
    """Suggest a handling strategy name for the outliers of ``data[column]``.

    Decision order:
      1. non-numeric column       -> the message string below
      2. datetime-typed column or monotonic column -> ``'interpolate'``
      3. at most 5 % outliers     -> ``'winsorization'``
      4. symmetric skewness label -> ``'mean'``
      5. anything else            -> ``'median'``

    Returns:
        One of the strategy names above, or
        ``"It is not a numerical Column"`` for a non-numeric column.
    """
    # Guard first: the detection and skewness helpers below operate on
    # numeric data, so they must not be reached for other columns.
    if not is_numeric(data, column):
        return "It is not a numerical Column"

    if is_time_series(data, column) or is_continous(data, column):
        return 'interpolate'

    percentage = outlier_percentage(detect_outliers(data, column))
    if percentage <= 5:
        return 'winsorization'

    distribution = skew_measurment(data[column])
    if distribution in ('Perfectly Symmetric', 'Approximately Symmetric'):
        return 'mean'
    # Every skewed label, and an unclassified distribution, use the median.
    return 'median'
|
|
96
|
+
|
|
97
|
+
def handle_outliers(data,column,detection_method='auto',replacement='auto',value=None):
    """Detect outliers in ``data[column]`` and replace or remove them.

    Parameters:
        data: DataFrame holding the column. Every strategy except
            ``'remove'`` modifies this object in place.
        column: label of the column to clean.
        detection_method: ``'auto'``, ``'Iqr_method'``, ``'z_score'``,
            ``'modified_z_score'`` or ``'percentile'``. ``'auto'`` uses the
            suggestion from ``detect_outlier_method``.
        replacement: ``'auto'``, ``'interpolate'``, ``'winsorization'``,
            ``'median'``, ``'mean'``, ``'mode'``, ``'custom'``, ``'remove'``,
            ``'bfill'`` or ``'ffill'``. ``'auto'`` uses the suggestion from
            ``detect_handler``.
        value: replacement value, required when ``replacement='custom'``.

    Returns:
        ``(data, report)`` where ``report`` is a one-row DataFrame describing
        the cleaned column. For a non-numeric column the string
        ``"It is Not a Numerical Data"`` is returned instead.

    Raises:
        ValueError: for an unknown detection or replacement method, or when
            ``replacement='custom'`` is used without ``value``.
    """
    if not is_numeric(data, column):
        # Kept as a plain string return so existing callers keep working.
        return "It is Not a Numerical Data"
    if is_binary(data, column):
        print( "Binary columns are not suitable for outlier detection")

    detectors = {
        'Iqr_method': Iqr_method,
        'z_score': z_score_method,
        'modified_z_score': modified_z_score,
        'percentile': percentile_method,
    }
    if detection_method == 'auto':
        detection_method = detect_outlier_method(data, column)
    if detection_method not in detectors:
        # Fail here with a clear error rather than later on an unbound mask.
        raise ValueError("Invalid Detection Method")
    outlier = detectors[detection_method](data, column)

    # Handlers that share the (data, column, mask) signature.
    mask_handlers = {
        'interpolate': interpolate,
        'median': replace_with_median,
        'mode': replace_with_mode,
        'mean': replace_with_mean,
        'remove': remove_outliers,
        'bfill': replace_with_backward_fill,
        'ffill': replace_with_forward_fill,
    }
    if replacement == 'auto':
        replacement = detect_handler(data, column)

    # Apply the chosen strategy exactly once: a second pass would recompute
    # the median/mean/fences on data that has already been cleaned.
    if replacement == 'winsorization':
        data = winsorization(data, column)
    elif replacement == 'custom':
        if value is None:
            raise ValueError("Please Provide Custom Value")
        data = replace_with_custom_value(data, column, outlier, value)
    elif replacement in mask_handlers:
        data = mask_handlers[replacement](data, column, outlier)
    else:
        raise ValueError("Invalid Replacement Method")

    # The report counts outliers remaining in the cleaned column, using the
    # automatically selected detector.
    outlier = detect_outliers(data, column)
    after_cleaning_report = {
        "Column": [column],
        "Detection Method": [detection_method],
        "Handling Method": [replacement],
        "Outlier Count": [outlier_count(outlier)],
        "Outlier Percentage": [outlier_percentage(outlier)],
        "Skewness": [skew(data[column])],
        "Distribution": [skew_measurment(data[column])]
    }
    return data,pd.DataFrame(after_cleaning_report)
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
"""Statistical functions for outlier detection."""
|
|
3
|
+
|
|
4
|
+
import numpy as np
|
|
5
|
+
from scipy.stats import skew as scipy_skew
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def mean(data):
    """Return ``data.mean()``."""
    average = data.mean()
    return average
|
|
10
|
+
|
|
11
|
+
def median(data):
    """Return ``data.median()``."""
    middle = data.median()
    return middle
|
|
13
|
+
|
|
14
|
+
def mode(data):
    """Return ``data.mode()`` (all most-frequent values, not a scalar)."""
    most_frequent = data.mode()
    return most_frequent
|
|
16
|
+
|
|
17
|
+
def std(data):
    """Return ``data.std()``."""
    deviation = data.std()
    return deviation
|
|
19
|
+
|
|
20
|
+
def var(data):
    """Return ``data.var()``."""
    variance = data.var()
    return variance
|
|
22
|
+
|
|
23
|
+
def data_range(data):
    """Return the spread of ``data``: its maximum minus its minimum."""
    highest = data.max()
    lowest = data.min()
    return highest - lowest
|
|
25
|
+
|
|
26
|
+
def q1(data):
    """Return the 25th percentile of ``data`` via ``np.percentile``."""
    return np.percentile(data, 25)
|
|
29
|
+
|
|
30
|
+
def q3(data):
    """Return the 75th percentile of ``data`` via ``np.percentile``."""
    return np.percentile(data, 75)
|
|
33
|
+
|
|
34
|
+
def iqr(data):
    """Return the interquartile range of ``data``: ``q3(data) - q1(data)``."""
    upper_quartile = q3(data)
    lower_quartile = q1(data)
    return upper_quartile - lower_quartile
|
|
36
|
+
|
|
37
|
+
def skew(data):
    """Return the skewness of ``data`` from ``scipy.stats.skew(bias=False)``."""
    skewness = scipy_skew(data, bias=False)
    return skewness
|
|
39
|
+
|
|
40
|
+
def skew_measurment(data):
    """Classify the skewness of ``data`` into a descriptive label.

    Bands, checked in this order:
        s == 0             -> "Perfectly Symmetric"
        -0.5 <= s <= 0.5   -> "Approximately Symmetric"
        0.5 < s <= 1       -> "Moderately Right Skewed"
        s > 1              -> "Highly Right Skewed"
        s < -1             -> "Highly Left Skewed"
        -1 <= s < -0.5     -> "Moderately Left Skewed"

    Returns:
        The label string, or ``None`` when the skewness matches no band
        (a NaN skewness fails every comparison).
    """
    s = skew(data)
    if s == 0:
        return "Perfectly Symmetric"
    # Written as a chained comparison; the previous "-0.5and" spelling fused
    # a numeric literal with a keyword, which Python has deprecated.
    if -0.5 <= s <= 0.5:
        return "Approximately Symmetric"
    if 0.5 <= s <= 1:
        return "Moderately Right Skewed"
    if s > 1:
        return "Highly Right Skewed"
    if s < -1:
        return "Highly Left Skewed"
    if -1 <= s <= -0.5:
        return "Moderately Left Skewed"
    return None
|
|
54
|
+
|
|
55
|
+
def is_normal(data):
    """Return True when the absolute skewness of ``data`` is below 0.5.

    Only skewness is examined; no other property of the distribution is
    tested.
    """
    return True if abs(skew(data)) < 0.5 else False
|
|
60
|
+
|
|
61
|
+
def kurtosis(data):
    """Return ``data.kurt()``."""
    tail_weight = data.kurt()
    return tail_weight
|
|
63
|
+
|
|
64
|
+
def kurtosis_measurement(data):
    """Label the kurtosis of ``data`` by its sign.

    Zero gives "Normal Distribution", a positive value "Heavier Tail" and
    a negative value "Lighter Tail". A value matching none of these
    comparisons falls through and yields ``None``.
    """
    k = kurtosis(data)
    checks = (
        (k == 0, "Normal Distribution"),
        (k > 0, "Heavier Tail"),
        (k < 0, "Lighter Tail"),
    )
    for matched, label in checks:
        if matched:
            return label
|
autooutlier/summary.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
"""Summary report generation for outlier analysis."""
|
|
3
|
+
|
|
4
|
+
import pandas as pd
|
|
5
|
+
|
|
6
|
+
from .statistics import skew, skew_measurment
|
|
7
|
+
from .utils import outlier_count, outlier_percentage
|
|
8
|
+
from .detection import detect_outliers, detect_outlier_method
|
|
9
|
+
from .handling import detect_handler
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def before_cleaning_summary(data, column):
    """Build a one-row DataFrame describing ``data[column]`` before cleaning.

    The row holds the column name, the suggested detection and handling
    methods, the skewness and its label, and the count and percentage of
    outliers found by ``detect_outliers``.
    """
    mask = detect_outliers(data, column)
    suggested_detection = detect_outlier_method(data, column)
    suggested_handling = detect_handler(data, column)
    values = data[column]

    rows = {}
    rows["Column"] = [column]
    rows["Suggested_Detection Method"] = [suggested_detection]
    rows["Handling Method"] = [suggested_handling]
    rows["Skewness"] = [skew(values)]
    rows["Distribution"] = [skew_measurment(values)]
    rows["Outlier Count"] = [outlier_count(mask)]
    rows["Outlier Percentage"] = [outlier_percentage(mask)]
    return pd.DataFrame(rows)
|
autooutlier/utils.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
"""Utility functions for outlier detection."""
|
|
3
|
+
|
|
4
|
+
import pandas as pd
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def is_numeric(data, column):
    """Return whether ``data[column]`` has a numeric dtype according to pandas."""
    series = data[column]
    return pd.api.types.is_numeric_dtype(series)
|
|
9
|
+
|
|
10
|
+
def is_time_series(data, column):
    """Return whether ``data[column]`` has a datetime64 dtype according to pandas."""
    series = data[column]
    return pd.api.types.is_datetime64_any_dtype(series)
|
|
12
|
+
|
|
13
|
+
def is_continous(data, column):
    """Return True for a numeric column whose values are monotonic.

    A non-numeric column yields False; a numeric one yields True when it
    is monotonically increasing or monotonically decreasing.
    """
    if not is_numeric(data, column):
        return False
    values = data[column]
    return values.is_monotonic_increasing or values.is_monotonic_decreasing
|
|
16
|
+
|
|
17
|
+
def outlier_count(outliers):
    """Return the number of True entries in the boolean mask ``outliers``."""
    flagged_total = outliers.sum()
    return flagged_total
|
|
19
|
+
|
|
20
|
+
def outlier_percentage(outliers):
    """Return the share of True entries in ``outliers`` as a percentage.

    Parameters:
        outliers: boolean mask exposing ``sum()`` and ``len()``.

    Returns:
        ``sum / length * 100``; ``0.0`` for an empty mask, which contains
        no outliers and would otherwise divide by zero.
    """
    total = len(outliers)
    if total == 0:
        return 0.0
    return (outliers.sum() / total) * 100
|
autooutlier/version.py
ADDED
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: autooutlier
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Automatic outlier detection and handling for Python.
|
|
5
|
+
Author: Suruthika C D
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/suruthika-cd/autooutlier
|
|
8
|
+
Project-URL: Repository, https://github.com/suruthika-cd/autooutlier
|
|
9
|
+
Project-URL: Documentation, https://github.com/suruthika-cd/autooutlier/tree/main/docs
|
|
10
|
+
Project-URL: Issues, https://github.com/suruthika-cd/autooutlier/issues
|
|
11
|
+
Project-URL: Changelog, https://github.com/suruthika-cd/autooutlier/blob/main/CHANGELOG.md
|
|
12
|
+
Keywords: outlier,outlier-detection,statistics,data-cleaning,data-science,machine-learning,preprocessing,pandas
|
|
13
|
+
Classifier: Development Status :: 3 - Alpha
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: Intended Audience :: Science/Research
|
|
16
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
17
|
+
Classifier: Operating System :: OS Independent
|
|
18
|
+
Classifier: Programming Language :: Python :: 3
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
23
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
24
|
+
Classifier: Topic :: Scientific/Engineering
|
|
25
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
26
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
27
|
+
Requires-Python: >=3.8
|
|
28
|
+
Description-Content-Type: text/markdown
|
|
29
|
+
License-File: LICENSE
|
|
30
|
+
Requires-Dist: numpy>=1.21.0
|
|
31
|
+
Requires-Dist: pandas>=1.3.0
|
|
32
|
+
Requires-Dist: scipy>=1.7.0
|
|
33
|
+
Requires-Dist: matplotlib>=3.4.0
|
|
34
|
+
Requires-Dist: seaborn>=0.11.0
|
|
35
|
+
Dynamic: license-file
|
|
36
|
+
|
|
37
|
+
# autooutlier
|
|
38
|
+
|
|
39
|
+
**Automatic Outlier Detection and Handling for Python**
|
|
40
|
+
|
|
41
|
+
[Python 3.8+](https://www.python.org/downloads/)
|
|
42
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
43
|
+
[PyPI](https://pypi.org/project/autooutlier/)
|
|
44
|
+
|
|
45
|
+
---
|
|
46
|
+
|
|
47
|
+
## Overview
|
|
48
|
+
|
|
49
|
+
`autooutlier` is a Python package that **automatically detects, analyzes, and handles outliers** in numerical data. It intelligently selects the best detection and handling methods based on data distribution — requiring **zero configuration** from the user.
|
|
50
|
+
|
|
51
|
+
Simply pass your DataFrame and column name, and `autooutlier` handles the rest.
|
|
52
|
+
|
|
53
|
+
---
|
|
54
|
+
|
|
55
|
+
## Features
|
|
56
|
+
|
|
57
|
+
- **Automatic Detection** — Selects the optimal outlier detection method (Z-Score, Modified Z-Score, IQR, Percentile) based on data skewness.
|
|
58
|
+
- **Automatic Handling** — Chooses the best outlier replacement strategy (winsorization, mean/median/mode replacement, interpolation, etc.) based on data characteristics.
|
|
59
|
+
- **Statistical Analysis** — Provides mean, median, mode, standard deviation, variance, skewness, kurtosis, and distribution classification.
|
|
60
|
+
- **Pre-Cleaning Summary** — Generates a comprehensive report before cleaning, including detection method, handling strategy, outlier count, and percentage.
|
|
61
|
+
- **Post-Cleaning Report** — Returns both the cleaned dataset and an after-cleaning summary report.
|
|
62
|
+
- **Flexible Manual Control** — Override automatic selections with manual detection and handling methods when needed.
|
|
63
|
+
- **Visualization** — Built-in box plot support via Seaborn.
|
|
64
|
+
|
|
65
|
+
---
|
|
66
|
+
|
|
67
|
+
## Installation
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
pip install autooutlier
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
Or install from source:
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
git clone https://github.com/suruthika-cd/autooutlier.git
|
|
77
|
+
cd autooutlier
|
|
78
|
+
pip install -e .
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
### Dependencies
|
|
82
|
+
|
|
83
|
+
- Python >= 3.8
|
|
84
|
+
- NumPy >= 1.21.0
|
|
85
|
+
- Pandas >= 1.3.0
|
|
86
|
+
- SciPy >= 1.7.0
|
|
87
|
+
- Seaborn >= 0.11.0
|
|
88
|
+
- Matplotlib >= 3.4.0
|
|
89
|
+
|
|
90
|
+
---
|
|
91
|
+
|
|
92
|
+
## Quick Start
|
|
93
|
+
|
|
94
|
+
```python
|
|
95
|
+
import pandas as pd
|
|
96
|
+
from autooutlier import handle_outliers, before_cleaning_summary, detect_outliers
|
|
97
|
+
|
|
98
|
+
# Load your data
|
|
99
|
+
df = pd.DataFrame({"values": [10, 12, 14, 11, 13, 100, 15, 12, 14, 11]})
|
|
100
|
+
|
|
101
|
+
# Get a pre-cleaning summary report
|
|
102
|
+
summary = before_cleaning_summary(df, "values")
|
|
103
|
+
print(summary)
|
|
104
|
+
|
|
105
|
+
# Automatically detect and handle outliers
|
|
106
|
+
cleaned_data, report = handle_outliers(df, "values")
|
|
107
|
+
print(report)
|
|
108
|
+
print(cleaned_data)
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
---
|
|
112
|
+
|
|
113
|
+
## Usage Examples
|
|
114
|
+
|
|
115
|
+
### Automatic Outlier Detection
|
|
116
|
+
|
|
117
|
+
```python
|
|
118
|
+
from autooutlier import detect_outliers
|
|
119
|
+
|
|
120
|
+
outlier_mask = detect_outliers(df, "column_name")
|
|
121
|
+
print(f"Outliers found: {outlier_mask.sum()}")
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
### Automatic Outlier Handling
|
|
125
|
+
|
|
126
|
+
```python
|
|
127
|
+
from autooutlier import handle_outliers
|
|
128
|
+
|
|
129
|
+
# Fully automatic — detection and handling methods are chosen for you
|
|
130
|
+
cleaned_df, report = handle_outliers(df, "column_name")
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
### Manual Detection Method
|
|
134
|
+
|
|
135
|
+
```python
|
|
136
|
+
# Use a specific detection method
|
|
137
|
+
cleaned_df, report = handle_outliers(df, "column_name", detection_method="z_score")
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
Available detection methods: `'auto'`, `'Iqr_method'`, `'z_score'`, `'modified_z_score'`, `'percentile'`
|
|
141
|
+
|
|
142
|
+
### Manual Handling Method
|
|
143
|
+
|
|
144
|
+
```python
|
|
145
|
+
# Use a specific replacement strategy
|
|
146
|
+
cleaned_df, report = handle_outliers(df, "column_name", replacement="median")
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
Available replacement methods: `'auto'`, `'interpolate'`, `'winsorization'`, `'median'`, `'mean'`, `'mode'`, `'custom'`, `'remove'`, `'bfill'`, `'ffill'`
|
|
150
|
+
|
|
151
|
+
### Custom Value Replacement
|
|
152
|
+
|
|
153
|
+
```python
|
|
154
|
+
cleaned_df, report = handle_outliers(df, "column_name", replacement="custom", value=0)
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
### Pre-Cleaning Summary
|
|
158
|
+
|
|
159
|
+
```python
|
|
160
|
+
from autooutlier import before_cleaning_summary
|
|
161
|
+
|
|
162
|
+
summary = before_cleaning_summary(df, "column_name")
|
|
163
|
+
print(summary)
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
Output includes: suggested detection method, handling method, skewness, distribution type, outlier count, and outlier percentage.
|
|
167
|
+
|
|
168
|
+
---
|
|
169
|
+
|
|
170
|
+
## API Overview
|
|
171
|
+
|
|
172
|
+
### Public API
|
|
173
|
+
|
|
174
|
+
| Function | Description |
|
|
175
|
+
|---|---|
|
|
176
|
+
| `handle_outliers(data, column, detection_method='auto', replacement='auto', value=None)` | Detect and handle outliers. Returns `(cleaned_data, report)`. |
|
|
177
|
+
| `detect_outliers(data, column)` | Detect outliers automatically. Returns a boolean mask. |
|
|
178
|
+
| `detect_outlier_method(data, column)` | Returns the suggested detection method name. |
|
|
179
|
+
| `before_cleaning_summary(data, column)` | Returns a DataFrame summary report before cleaning. |
|
|
180
|
+
|
|
181
|
+
### Module Reference
|
|
182
|
+
|
|
183
|
+
| Module | Contents |
|
|
184
|
+
|---|---|
|
|
185
|
+
| `autooutlier.statistics` | `mean`, `median`, `mode`, `std`, `var`, `data_range`, `q1`, `q3`, `iqr`, `skew`, `skew_measurment`, `is_normal`, `kurtosis`, `kurtosis_measurement` |
|
|
186
|
+
| `autooutlier.detection` | `Iqr_method`, `z_score_method`, `modified_z_score`, `percentile_method`, `detect_outlier_method`, `detect_outliers` |
|
|
187
|
+
| `autooutlier.handling` | `winsorization`, `interpolate`, `replace_with_mean`, `replace_with_median`, `replace_with_mode`, `replace_with_custom_value`, `replace_with_forward_fill`, `replace_with_backward_fill`, `remove_outliers`, `detect_handler`, `handle_outliers` |
|
|
188
|
+
| `autooutlier.summary` | `before_cleaning_summary` |
|
|
189
|
+
| `autooutlier.visualization` | `box_plot` |
|
|
190
|
+
| `autooutlier.utils` | `is_numeric`, `is_time_series`, `is_continous`, `outlier_count`, `outlier_percentage` |
|
|
191
|
+
|
|
192
|
+
---
|
|
193
|
+
|
|
194
|
+
## License
|
|
195
|
+
|
|
196
|
+
This project is licensed under the MIT License — see the [LICENSE](LICENSE) file for details.
|
|
197
|
+
|
|
198
|
+
---
|
|
199
|
+
|
|
200
|
+
## Changelog
|
|
201
|
+
|
|
202
|
+
See [CHANGELOG.md](CHANGELOG.md) for all notable changes.
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
autooutlier/__init__.py,sha256=FdNiubZ9h9M_LWc4cSl7wYdvY5DLNV02DOpNSpta5CY,727
|
|
2
|
+
autooutlier/detection.py,sha256=I67FOHk9R6V-AVl4wPkMWLSIV0C2qWFKM21pIR-LRIE,2085
|
|
3
|
+
autooutlier/handling.py,sha256=SwWanqUyMoXZ7hCVKnPuJgRAUMDm3od2sCu3ota4BVI,6103
|
|
4
|
+
autooutlier/statistics.py,sha256=KKZAyzGQie2nk-MeiwhBFv-zojZlV3LFDreU7ntUfc0,1343
|
|
5
|
+
autooutlier/summary.py,sha256=-LR27vQ-CZc9XIAvZXZho2w_M-fULCUByLw9lHO8t_A,817
|
|
6
|
+
autooutlier/utils.py,sha256=Yu-C-sP5_k9sDh7RAUFWD9k6zeGb2vW5khv600wcMNE,590
|
|
7
|
+
autooutlier/version.py,sha256=mo7LZfe2rfqFJJKr1EEhMo40k827XqkGRbCbYwqJupE,90
|
|
8
|
+
autooutlier-0.1.0.dist-info/licenses/LICENSE,sha256=uwwHmyknRbaARKLV6ZVhLLdag_GxRTPqzSZmdX5awjI,1069
|
|
9
|
+
autooutlier-0.1.0.dist-info/METADATA,sha256=WycEeWSmkLmgNkL-2qrk4aB3fd9-zx1pHTVNTR-azP4,7273
|
|
10
|
+
autooutlier-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
11
|
+
autooutlier-0.1.0.dist-info/top_level.txt,sha256=Pk6Zbp6lqeso22EOztDWKbdR15JfI7aDnT_pIv3OxlM,12
|
|
12
|
+
autooutlier-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Suruthika C D
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
autooutlier
|