autooutlier 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- autooutlier-0.1.0/CHANGELOG.md +135 -0
- autooutlier-0.1.0/LICENSE +21 -0
- autooutlier-0.1.0/MANIFEST.in +28 -0
- autooutlier-0.1.0/PKG-INFO +202 -0
- autooutlier-0.1.0/README.md +166 -0
- autooutlier-0.1.0/autooutlier/__init__.py +26 -0
- autooutlier-0.1.0/autooutlier/detection.py +67 -0
- autooutlier-0.1.0/autooutlier/handling.py +173 -0
- autooutlier-0.1.0/autooutlier/statistics.py +71 -0
- autooutlier-0.1.0/autooutlier/summary.py +25 -0
- autooutlier-0.1.0/autooutlier/utils.py +21 -0
- autooutlier-0.1.0/autooutlier/version.py +4 -0
- autooutlier-0.1.0/autooutlier.egg-info/PKG-INFO +202 -0
- autooutlier-0.1.0/autooutlier.egg-info/SOURCES.txt +33 -0
- autooutlier-0.1.0/autooutlier.egg-info/dependency_links.txt +1 -0
- autooutlier-0.1.0/autooutlier.egg-info/requires.txt +5 -0
- autooutlier-0.1.0/autooutlier.egg-info/top_level.txt +1 -0
- autooutlier-0.1.0/docs/api_reference.md +842 -0
- autooutlier-0.1.0/docs/changelog.md +96 -0
- autooutlier-0.1.0/docs/detection_methods.md +349 -0
- autooutlier-0.1.0/docs/examples.md +432 -0
- autooutlier-0.1.0/docs/faq.md +271 -0
- autooutlier-0.1.0/docs/handling_methods.md +513 -0
- autooutlier-0.1.0/docs/index.md +97 -0
- autooutlier-0.1.0/docs/installation.md +157 -0
- autooutlier-0.1.0/docs/quickstart.md +171 -0
- autooutlier-0.1.0/docs/user_guide.md +393 -0
- autooutlier-0.1.0/examples/basic_usage.py +78 -0
- autooutlier-0.1.0/pyproject.toml +69 -0
- autooutlier-0.1.0/requirements.txt +3 -0
- autooutlier-0.1.0/setup.cfg +4 -0
- autooutlier-0.1.0/tests/test_detection.py +93 -0
- autooutlier-0.1.0/tests/test_handling.py +127 -0
- autooutlier-0.1.0/tests/test_statistics.py +124 -0
- autooutlier-0.1.0/tests/test_summary.py +42 -0
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
+
|
|
8
|
+
---
|
|
9
|
+
|
|
10
|
+
## [0.1.0] - 2026-06-28
|
|
11
|
+
|
|
12
|
+
### Added
|
|
13
|
+
|
|
14
|
+
#### Core Features
|
|
15
|
+
- Initial public release of **autooutlier**.
|
|
16
|
+
- Automatic outlier detection with intelligent method selection.
|
|
17
|
+
- Automatic outlier handling based on dataset characteristics.
|
|
18
|
+
- Manual selection of detection and handling methods.
|
|
19
|
+
- Support for custom replacement values.
|
|
20
|
+
|
|
21
|
+
#### Statistics Module
|
|
22
|
+
- Descriptive statistics:
|
|
23
|
+
- Mean
|
|
24
|
+
- Median
|
|
25
|
+
- Mode
|
|
26
|
+
- Standard Deviation
|
|
27
|
+
- Variance
|
|
28
|
+
- Data Range
|
|
29
|
+
- Quartile calculations (Q1, Q3, IQR)
|
|
30
|
+
- Skewness analysis
|
|
31
|
+
- Kurtosis analysis
|
|
32
|
+
- Distribution classification
|
|
33
|
+
|
|
34
|
+
#### Detection Module
|
|
35
|
+
Implemented the following outlier detection methods:
|
|
36
|
+
|
|
37
|
+
- IQR Method
|
|
38
|
+
- Z-Score Method
|
|
39
|
+
- Modified Z-Score Method
|
|
40
|
+
- Percentile Method
|
|
41
|
+
- Automatic Detection Method Selection
|
|
42
|
+
- Boolean Outlier Mask Generation
|
|
43
|
+
|
|
44
|
+
#### Handling Module
|
|
45
|
+
Implemented multiple outlier handling strategies:
|
|
46
|
+
|
|
47
|
+
- Winsorization
|
|
48
|
+
- Mean Replacement
|
|
49
|
+
- Median Replacement
|
|
50
|
+
- Mode Replacement
|
|
51
|
+
- Interpolation
|
|
52
|
+
- Forward Fill
|
|
53
|
+
- Backward Fill
|
|
54
|
+
- Custom Value Replacement
|
|
55
|
+
- Outlier Removal
|
|
56
|
+
- Automatic Handling Method Selection
|
|
57
|
+
|
|
58
|
+
#### Summary Module
|
|
59
|
+
Added comprehensive pre-cleaning analysis including:
|
|
60
|
+
|
|
61
|
+
- Recommended detection method
|
|
62
|
+
- Recommended handling method
|
|
63
|
+
- Number of outliers
|
|
64
|
+
- Percentage of outliers
|
|
65
|
+
- Distribution type
|
|
66
|
+
- Skewness report
|
|
67
|
+
- Statistical summary
|
|
68
|
+
|
|
69
|
+
#### Visualization Module
|
|
70
|
+
- Box Plot visualization using Seaborn.
|
|
71
|
+
|
|
72
|
+
#### Utility Module
|
|
73
|
+
Added helper functions for:
|
|
74
|
+
|
|
75
|
+
- Numeric column validation
|
|
76
|
+
- Continuous data validation
|
|
77
|
+
- Time-series detection
|
|
78
|
+
- Outlier counting
|
|
79
|
+
- Outlier percentage calculation
|
|
80
|
+
|
|
81
|
+
#### Documentation
|
|
82
|
+
Added professional project documentation including:
|
|
83
|
+
|
|
84
|
+
- README
|
|
85
|
+
- Installation Guide
|
|
86
|
+
- User Guide
|
|
87
|
+
- API Reference
|
|
88
|
+
- Detection Methods Guide
|
|
89
|
+
- Handling Methods Guide
|
|
90
|
+
- Examples
|
|
91
|
+
- FAQ
|
|
92
|
+
- Changelog
|
|
93
|
+
|
|
94
|
+
#### Examples
|
|
95
|
+
Added practical usage examples demonstrating:
|
|
96
|
+
|
|
97
|
+
- Automatic detection
|
|
98
|
+
- Automatic handling
|
|
99
|
+
- Manual detection
|
|
100
|
+
- Manual handling
|
|
101
|
+
- Summary generation
|
|
102
|
+
|
|
103
|
+
#### Testing
|
|
104
|
+
Added unit tests for:
|
|
105
|
+
|
|
106
|
+
- Detection module
|
|
107
|
+
- Handling module
|
|
108
|
+
- Statistics module
|
|
109
|
+
- Summary module
|
|
110
|
+
|
|
111
|
+
#### Packaging
|
|
112
|
+
Configured the project for distribution using:
|
|
113
|
+
|
|
114
|
+
- pyproject.toml
|
|
115
|
+
- setuptools
|
|
116
|
+
- MANIFEST.in
|
|
117
|
+
- Editable installation support
|
|
118
|
+
- Source distribution (sdist)
|
|
119
|
+
- Wheel distribution
|
|
120
|
+
|
|
121
|
+
#### Project Structure
|
|
122
|
+
Added:
|
|
123
|
+
|
|
124
|
+
- MIT License
|
|
125
|
+
- Version management
|
|
126
|
+
- Documentation folder
|
|
127
|
+
- Examples folder
|
|
128
|
+
- Tests folder
|
|
129
|
+
- Professional package layout
|
|
130
|
+
|
|
131
|
+
---
|
|
132
|
+
|
|
133
|
+
## License
|
|
134
|
+
|
|
135
|
+
This project is licensed under the MIT License.
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Suruthika C D
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# ------------------------------------------------------------------
|
|
2
|
+
# MANIFEST.in
|
|
3
|
+
# Files to include in the source distribution (sdist)
|
|
4
|
+
# ------------------------------------------------------------------
|
|
5
|
+
|
|
6
|
+
# Project metadata
|
|
7
|
+
include LICENSE
|
|
8
|
+
include README.md
|
|
9
|
+
include CHANGELOG.md
|
|
10
|
+
include requirements.txt
|
|
11
|
+
|
|
12
|
+
# Include all package source files
|
|
13
|
+
recursive-include autooutlier *.py
|
|
14
|
+
|
|
15
|
+
# Include documentation
|
|
16
|
+
recursive-include docs *.md
|
|
17
|
+
|
|
18
|
+
# Include examples
|
|
19
|
+
recursive-include examples *.py
|
|
20
|
+
|
|
21
|
+
# Include tests
|
|
22
|
+
recursive-include tests *.py
|
|
23
|
+
|
|
24
|
+
# Exclude Python cache files
|
|
25
|
+
global-exclude *.py[cod]
|
|
26
|
+
global-exclude __pycache__
|
|
27
|
+
global-exclude *.so
|
|
28
|
+
global-exclude .DS_Store
|
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: autooutlier
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Automatic outlier detection and handling for Python.
|
|
5
|
+
Author: Suruthika C D
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/suruthika-cd/autooutlier
|
|
8
|
+
Project-URL: Repository, https://github.com/suruthika-cd/autooutlier
|
|
9
|
+
Project-URL: Documentation, https://github.com/suruthika-cd/autooutlier/tree/main/docs
|
|
10
|
+
Project-URL: Issues, https://github.com/suruthika-cd/autooutlier/issues
|
|
11
|
+
Project-URL: Changelog, https://github.com/suruthika-cd/autooutlier/blob/main/CHANGELOG.md
|
|
12
|
+
Keywords: outlier,outlier-detection,statistics,data-cleaning,data-science,machine-learning,preprocessing,pandas
|
|
13
|
+
Classifier: Development Status :: 3 - Alpha
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: Intended Audience :: Science/Research
|
|
16
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
17
|
+
Classifier: Operating System :: OS Independent
|
|
18
|
+
Classifier: Programming Language :: Python :: 3
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
23
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
24
|
+
Classifier: Topic :: Scientific/Engineering
|
|
25
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
26
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
27
|
+
Requires-Python: >=3.8
|
|
28
|
+
Description-Content-Type: text/markdown
|
|
29
|
+
License-File: LICENSE
|
|
30
|
+
Requires-Dist: numpy>=1.21.0
|
|
31
|
+
Requires-Dist: pandas>=1.3.0
|
|
32
|
+
Requires-Dist: scipy>=1.7.0
|
|
33
|
+
Requires-Dist: matplotlib>=3.4.0
|
|
34
|
+
Requires-Dist: seaborn>=0.11.0
|
|
35
|
+
Dynamic: license-file
|
|
36
|
+
|
|
37
|
+
# autooutlier
|
|
38
|
+
|
|
39
|
+
**Automatic Outlier Detection and Handling for Python**
|
|
40
|
+
|
|
41
|
+
[](https://www.python.org/downloads/)
|
|
42
|
+
[](https://opensource.org/licenses/MIT)
|
|
43
|
+
[](https://pypi.org/project/autooutlier/)
|
|
44
|
+
|
|
45
|
+
---
|
|
46
|
+
|
|
47
|
+
## Overview
|
|
48
|
+
|
|
49
|
+
`autooutlier` is a Python package that **automatically detects, analyzes, and handles outliers** in numerical data. It intelligently selects the best detection and handling methods based on data distribution — requiring **zero configuration** from the user.
|
|
50
|
+
|
|
51
|
+
Simply pass your DataFrame and column name, and `autooutlier` handles the rest.
|
|
52
|
+
|
|
53
|
+
---
|
|
54
|
+
|
|
55
|
+
## Features
|
|
56
|
+
|
|
57
|
+
- **Automatic Detection** — Selects the optimal outlier detection method (Z-Score, Modified Z-Score, IQR, Percentile) based on data skewness.
|
|
58
|
+
- **Automatic Handling** — Chooses the best outlier replacement strategy (winsorization, mean/median/mode replacement, interpolation, etc.) based on data characteristics.
|
|
59
|
+
- **Statistical Analysis** — Provides mean, median, mode, standard deviation, variance, skewness, kurtosis, and distribution classification.
|
|
60
|
+
- **Pre-Cleaning Summary** — Generates a comprehensive report before cleaning, including detection method, handling strategy, outlier count, and percentage.
|
|
61
|
+
- **Post-Cleaning Report** — Returns both the cleaned dataset and an after-cleaning summary report.
|
|
62
|
+
- **Flexible Manual Control** — Override automatic selections with manual detection and handling methods when needed.
|
|
63
|
+
- **Visualization** — Built-in box plot support via Seaborn.
|
|
64
|
+
|
|
65
|
+
---
|
|
66
|
+
|
|
67
|
+
## Installation
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
pip install autooutlier
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
Or install from source:
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
git clone https://github.com/suruthika-cd/autooutlier.git
|
|
77
|
+
cd autooutlier
|
|
78
|
+
pip install -e .
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
### Dependencies
|
|
82
|
+
|
|
83
|
+
- Python >= 3.8
|
|
84
|
+
- NumPy >= 1.21.0
|
|
85
|
+
- Pandas >= 1.3.0
|
|
86
|
+
- SciPy >= 1.7.0
|
|
87
|
+
- Seaborn >= 0.11.0
|
|
88
|
+
- Matplotlib >= 3.4.0
|
|
89
|
+
|
|
90
|
+
---
|
|
91
|
+
|
|
92
|
+
## Quick Start
|
|
93
|
+
|
|
94
|
+
```python
|
|
95
|
+
import pandas as pd
|
|
96
|
+
from autooutlier import handle_outliers, before_cleaning_summary, detect_outliers
|
|
97
|
+
|
|
98
|
+
# Load your data
|
|
99
|
+
df = pd.DataFrame({"values": [10, 12, 14, 11, 13, 100, 15, 12, 14, 11]})
|
|
100
|
+
|
|
101
|
+
# Get a pre-cleaning summary report
|
|
102
|
+
summary = before_cleaning_summary(df, "values")
|
|
103
|
+
print(summary)
|
|
104
|
+
|
|
105
|
+
# Automatically detect and handle outliers
|
|
106
|
+
cleaned_data, report = handle_outliers(df, "values")
|
|
107
|
+
print(report)
|
|
108
|
+
print(cleaned_data)
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
---
|
|
112
|
+
|
|
113
|
+
## Usage Examples
|
|
114
|
+
|
|
115
|
+
### Automatic Outlier Detection
|
|
116
|
+
|
|
117
|
+
```python
|
|
118
|
+
from autooutlier import detect_outliers
|
|
119
|
+
|
|
120
|
+
outlier_mask = detect_outliers(df, "column_name")
|
|
121
|
+
print(f"Outliers found: {outlier_mask.sum()}")
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
### Automatic Outlier Handling
|
|
125
|
+
|
|
126
|
+
```python
|
|
127
|
+
from autooutlier import handle_outliers
|
|
128
|
+
|
|
129
|
+
# Fully automatic — detection and handling methods are chosen for you
|
|
130
|
+
cleaned_df, report = handle_outliers(df, "column_name")
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
### Manual Detection Method
|
|
134
|
+
|
|
135
|
+
```python
|
|
136
|
+
# Use a specific detection method
|
|
137
|
+
cleaned_df, report = handle_outliers(df, "column_name", detection_method="z_score")
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
Available detection methods: `'auto'`, `'Iqr_method'`, `'z_score'`, `'modified_z_score'`, `'percentile'`
|
|
141
|
+
|
|
142
|
+
### Manual Handling Method
|
|
143
|
+
|
|
144
|
+
```python
|
|
145
|
+
# Use a specific replacement strategy
|
|
146
|
+
cleaned_df, report = handle_outliers(df, "column_name", replacement="median")
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
Available replacement methods: `'auto'`, `'interpolate'`, `'winsorization'`, `'median'`, `'mean'`, `'mode'`, `'custom'`, `'remove'`, `'bfill'`, `'ffill'`
|
|
150
|
+
|
|
151
|
+
### Custom Value Replacement
|
|
152
|
+
|
|
153
|
+
```python
|
|
154
|
+
cleaned_df, report = handle_outliers(df, "column_name", replacement="custom", value=0)
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
### Pre-Cleaning Summary
|
|
158
|
+
|
|
159
|
+
```python
|
|
160
|
+
from autooutlier import before_cleaning_summary
|
|
161
|
+
|
|
162
|
+
summary = before_cleaning_summary(df, "column_name")
|
|
163
|
+
print(summary)
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
Output includes: suggested detection method, handling method, skewness, distribution type, outlier count, and outlier percentage.
|
|
167
|
+
|
|
168
|
+
---
|
|
169
|
+
|
|
170
|
+
## API Overview
|
|
171
|
+
|
|
172
|
+
### Public API
|
|
173
|
+
|
|
174
|
+
| Function | Description |
|
|
175
|
+
|---|---|
|
|
176
|
+
| `handle_outliers(data, column, detection_method='auto', replacement='auto', value=None)` | Detect and handle outliers. Returns `(cleaned_data, report)`. |
|
|
177
|
+
| `detect_outliers(data, column)` | Detect outliers automatically. Returns a boolean mask. |
|
|
178
|
+
| `detect_outlier_method(data, column)` | Returns the suggested detection method name. |
|
|
179
|
+
| `before_cleaning_summary(data, column)` | Returns a DataFrame summary report before cleaning. |
|
|
180
|
+
|
|
181
|
+
### Module Reference
|
|
182
|
+
|
|
183
|
+
| Module | Contents |
|
|
184
|
+
|---|---|
|
|
185
|
+
| `autooutlier.statistics` | `mean`, `median`, `mode`, `std`, `var`, `data_range`, `q1`, `q3`, `iqr`, `skew`, `skew_measurment`, `is_normal`, `kurtosis`, `kurtosis_measurement` |
|
|
186
|
+
| `autooutlier.detection` | `Iqr_method`, `z_score_method`, `modified_z_score`, `percentile_method`, `detect_outlier_method`, `detect_outliers` |
|
|
187
|
+
| `autooutlier.handling` | `winsorization`, `interpolate`, `replace_with_mean`, `replace_with_median`, `replace_with_mode`, `replace_with_custom_value`, `replace_with_forward_fill`, `replace_with_backward_fill`, `remove_outliers`, `detect_handler`, `handle_outliers` |
|
|
188
|
+
| `autooutlier.summary` | `before_cleaning_summary` |
|
|
189
|
+
| `autooutlier.visualization` | `box_plot` |
|
|
190
|
+
| `autooutlier.utils` | `is_numeric`, `is_time_series`, `is_continous`, `outlier_count`, `outlier_percentage` |
|
|
191
|
+
|
|
192
|
+
---
|
|
193
|
+
|
|
194
|
+
## License
|
|
195
|
+
|
|
196
|
+
This project is licensed under the MIT License — see the [LICENSE](LICENSE) file for details.
|
|
197
|
+
|
|
198
|
+
---
|
|
199
|
+
|
|
200
|
+
## Changelog
|
|
201
|
+
|
|
202
|
+
See [CHANGELOG.md](CHANGELOG.md) for all notable changes.
|
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
# autooutlier
|
|
2
|
+
|
|
3
|
+
**Automatic Outlier Detection and Handling for Python**
|
|
4
|
+
|
|
5
|
+
[Python 3.8+](https://www.python.org/downloads/)
|
|
6
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
7
|
+
[PyPI: autooutlier](https://pypi.org/project/autooutlier/)
|
|
8
|
+
|
|
9
|
+
---
|
|
10
|
+
|
|
11
|
+
## Overview
|
|
12
|
+
|
|
13
|
+
`autooutlier` is a Python package that **automatically detects, analyzes, and handles outliers** in numerical data. It intelligently selects the best detection and handling methods based on data distribution — requiring **zero configuration** from the user.
|
|
14
|
+
|
|
15
|
+
Simply pass your DataFrame and column name, and `autooutlier` handles the rest.
|
|
16
|
+
|
|
17
|
+
---
|
|
18
|
+
|
|
19
|
+
## Features
|
|
20
|
+
|
|
21
|
+
- **Automatic Detection** — Selects the optimal outlier detection method (Z-Score, Modified Z-Score, or IQR) based on data skewness; the Percentile method is available through manual selection.
|
|
22
|
+
- **Automatic Handling** — Chooses the best outlier replacement strategy (winsorization, mean/median/mode replacement, interpolation, etc.) based on data characteristics.
|
|
23
|
+
- **Statistical Analysis** — Provides mean, median, mode, standard deviation, variance, skewness, kurtosis, and distribution classification.
|
|
24
|
+
- **Pre-Cleaning Summary** — Generates a comprehensive report before cleaning, including detection method, handling strategy, outlier count, and percentage.
|
|
25
|
+
- **Post-Cleaning Report** — Returns both the cleaned dataset and an after-cleaning summary report.
|
|
26
|
+
- **Flexible Manual Control** — Override automatic selections with manual detection and handling methods when needed.
|
|
27
|
+
- **Visualization** — Built-in box plot support via Seaborn.
|
|
28
|
+
|
|
29
|
+
---
|
|
30
|
+
|
|
31
|
+
## Installation
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
pip install autooutlier
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
Or install from source:
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
git clone https://github.com/suruthika-cd/autooutlier.git
|
|
41
|
+
cd autooutlier
|
|
42
|
+
pip install -e .
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
### Dependencies
|
|
46
|
+
|
|
47
|
+
- Python >= 3.8
|
|
48
|
+
- NumPy >= 1.21.0
|
|
49
|
+
- Pandas >= 1.3.0
|
|
50
|
+
- SciPy >= 1.7.0
|
|
51
|
+
- Seaborn >= 0.11.0
|
|
52
|
+
- Matplotlib >= 3.4.0
|
|
53
|
+
|
|
54
|
+
---
|
|
55
|
+
|
|
56
|
+
## Quick Start
|
|
57
|
+
|
|
58
|
+
```python
|
|
59
|
+
import pandas as pd
|
|
60
|
+
from autooutlier import handle_outliers, before_cleaning_summary, detect_outliers
|
|
61
|
+
|
|
62
|
+
# Load your data
|
|
63
|
+
df = pd.DataFrame({"values": [10, 12, 14, 11, 13, 100, 15, 12, 14, 11]})
|
|
64
|
+
|
|
65
|
+
# Get a pre-cleaning summary report
|
|
66
|
+
summary = before_cleaning_summary(df, "values")
|
|
67
|
+
print(summary)
|
|
68
|
+
|
|
69
|
+
# Automatically detect and handle outliers
|
|
70
|
+
cleaned_data, report = handle_outliers(df, "values")
|
|
71
|
+
print(report)
|
|
72
|
+
print(cleaned_data)
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
---
|
|
76
|
+
|
|
77
|
+
## Usage Examples
|
|
78
|
+
|
|
79
|
+
### Automatic Outlier Detection
|
|
80
|
+
|
|
81
|
+
```python
|
|
82
|
+
from autooutlier import detect_outliers
|
|
83
|
+
|
|
84
|
+
outlier_mask = detect_outliers(df, "column_name")
|
|
85
|
+
print(f"Outliers found: {outlier_mask.sum()}")
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
### Automatic Outlier Handling
|
|
89
|
+
|
|
90
|
+
```python
|
|
91
|
+
from autooutlier import handle_outliers
|
|
92
|
+
|
|
93
|
+
# Fully automatic — detection and handling methods are chosen for you
|
|
94
|
+
cleaned_df, report = handle_outliers(df, "column_name")
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
### Manual Detection Method
|
|
98
|
+
|
|
99
|
+
```python
|
|
100
|
+
# Use a specific detection method
|
|
101
|
+
cleaned_df, report = handle_outliers(df, "column_name", detection_method="z_score")
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
Available detection methods: `'auto'`, `'Iqr_method'`, `'z_score'`, `'modified_z_score'`, `'percentile'`
|
|
105
|
+
|
|
106
|
+
### Manual Handling Method
|
|
107
|
+
|
|
108
|
+
```python
|
|
109
|
+
# Use a specific replacement strategy
|
|
110
|
+
cleaned_df, report = handle_outliers(df, "column_name", replacement="median")
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
Available replacement methods: `'auto'`, `'interpolate'`, `'winsorization'`, `'median'`, `'mean'`, `'mode'`, `'custom'`, `'remove'`, `'bfill'`, `'ffill'`
|
|
114
|
+
|
|
115
|
+
### Custom Value Replacement
|
|
116
|
+
|
|
117
|
+
```python
|
|
118
|
+
cleaned_df, report = handle_outliers(df, "column_name", replacement="custom", value=0)
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
### Pre-Cleaning Summary
|
|
122
|
+
|
|
123
|
+
```python
|
|
124
|
+
from autooutlier import before_cleaning_summary
|
|
125
|
+
|
|
126
|
+
summary = before_cleaning_summary(df, "column_name")
|
|
127
|
+
print(summary)
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
Output includes: suggested detection method, handling method, skewness, distribution type, outlier count, and outlier percentage.
|
|
131
|
+
|
|
132
|
+
---
|
|
133
|
+
|
|
134
|
+
## API Overview
|
|
135
|
+
|
|
136
|
+
### Public API
|
|
137
|
+
|
|
138
|
+
| Function | Description |
|
|
139
|
+
|---|---|
|
|
140
|
+
| `handle_outliers(data, column, detection_method='auto', replacement='auto', value=None)` | Detect and handle outliers. Returns `(cleaned_data, report)`. |
|
|
141
|
+
| `detect_outliers(data, column)` | Detect outliers automatically. Returns a boolean mask. |
|
|
142
|
+
| `detect_outlier_method(data, column)` | Returns the suggested detection method name. |
|
|
143
|
+
| `before_cleaning_summary(data, column)` | Returns a DataFrame summary report before cleaning. |
|
|
144
|
+
|
|
145
|
+
### Module Reference
|
|
146
|
+
|
|
147
|
+
| Module | Contents |
|
|
148
|
+
|---|---|
|
|
149
|
+
| `autooutlier.statistics` | `mean`, `median`, `mode`, `std`, `var`, `data_range`, `q1`, `q3`, `iqr`, `skew`, `skew_measurment`, `is_normal`, `kurtosis`, `kurtosis_measurement` |
|
|
150
|
+
| `autooutlier.detection` | `Iqr_method`, `z_score_method`, `modified_z_score`, `percentile_method`, `detect_outlier_method`, `detect_outliers` |
|
|
151
|
+
| `autooutlier.handling` | `winsorization`, `interpolate`, `replace_with_mean`, `replace_with_median`, `replace_with_mode`, `replace_with_custom_value`, `replace_with_forward_fill`, `replace_with_backward_fill`, `remove_outliers`, `detect_handler`, `handle_outliers` |
|
|
152
|
+
| `autooutlier.summary` | `before_cleaning_summary` |
|
|
153
|
+
| `autooutlier.visualization` | `box_plot` |
|
|
154
|
+
| `autooutlier.utils` | `is_numeric`, `is_time_series`, `is_continous`, `outlier_count`, `outlier_percentage` |
|
|
155
|
+
|
|
156
|
+
---
|
|
157
|
+
|
|
158
|
+
## License
|
|
159
|
+
|
|
160
|
+
This project is licensed under the MIT License — see the [LICENSE](LICENSE) file for details.
|
|
161
|
+
|
|
162
|
+
---
|
|
163
|
+
|
|
164
|
+
## Changelog
|
|
165
|
+
|
|
166
|
+
See [CHANGELOG.md](CHANGELOG.md) for all notable changes.
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
"""
|
|
3
|
+
autooutlier - Automatic Outlier Detection and Handling for Python
|
|
4
|
+
=================================================================
|
|
5
|
+
|
|
6
|
+
A professional Python package for automatic outlier detection,
|
|
7
|
+
handling, and statistical analysis of numerical data.
|
|
8
|
+
|
|
9
|
+
Usage:
|
|
10
|
+
>>> import autooutlier
|
|
11
|
+
>>> from autooutlier import handle_outliers, detect_outliers
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from .handling import handle_outliers
|
|
15
|
+
from .summary import before_cleaning_summary
|
|
16
|
+
from .detection import detect_outliers
|
|
17
|
+
from .detection import detect_outlier_method
|
|
18
|
+
from .version import __version__
|
|
19
|
+
|
|
20
|
+
__all__ = [
|
|
21
|
+
"handle_outliers",
|
|
22
|
+
"before_cleaning_summary",
|
|
23
|
+
"detect_outliers",
|
|
24
|
+
"detect_outlier_method",
|
|
25
|
+
"__version__",
|
|
26
|
+
]
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
"""Outlier detection methods."""
|
|
3
|
+
|
|
4
|
+
import numpy as np
|
|
5
|
+
|
|
6
|
+
from .statistics import mean, std, q1, q3, iqr, skew_measurment
|
|
7
|
+
from .utils import is_numeric
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def Iqr_method(data, column):
    """Flag values of ``data[column]`` that fall outside the 1.5 * IQR fences.

    The quartiles and the interquartile range are taken from this package's
    ``q1``, ``q3`` and ``iqr`` helpers.

    Args:
        data: Table-like object indexable by ``column`` (a DataFrame in the
            package's documented usage).
        column: Name of the column to inspect.

    Returns:
        The element-wise boolean result of comparing the column against the
        two fences; ``True`` marks a value strictly above the upper fence or
        strictly below the lower fence.
    """
    series = data[column]
    first_quartile = q1(series)
    third_quartile = q3(series)
    spread = iqr(series)

    # Fences sit 1.5 interquartile ranges beyond each quartile.
    lower_bound = first_quartile - 1.5 * spread
    upper_bound = third_quartile + 1.5 * spread

    too_high = series > upper_bound
    too_low = series < lower_bound
    return too_high | too_low
|
|
18
|
+
|
|
19
|
+
def z_score_method(data, column):
    """Flag values of ``data[column]`` whose absolute z-score exceeds 3.

    The centre and the spread come from this package's ``mean`` and ``std``
    helpers.

    Args:
        data: DataFrame holding the column to inspect.
        column: Name of the column to inspect.

    Returns:
        A boolean Series aligned with ``data[column]``. When the spread is
        zero the z-score is undefined, so an all-``False`` Series is returned.
    """
    # Local import: the module imports only numpy at file level.
    import pandas as pd

    series = data[column]
    center = mean(series)
    spread = std(series)

    if spread == 0:
        # Return a Series rather than a bare ndarray so both branches hand
        # callers the same type, index and name.
        return pd.Series(False, index=series.index, name=series.name, dtype=bool)

    z_scores = (series - center) / spread
    return abs(z_scores) > 3
|
|
27
|
+
|
|
28
|
+
def modified_z_score(data, column):
    """Flag values of ``data[column]`` whose absolute modified z-score exceeds 3.5.

    The score is ``0.6745 * (x - median) / MAD``, where MAD is the median of
    the absolute deviations from the column median. The 0.6745 factor and the
    3.5 cutoff are the constants conventionally used for this statistic.

    Args:
        data: DataFrame holding the column to inspect.
        column: Name of the column to inspect.

    Returns:
        A boolean Series aligned with ``data[column]``. When the MAD is zero
        the score is undefined, so an all-``False`` Series is returned.
    """
    # Local import: the module imports only numpy at file level.
    import pandas as pd

    series = data[column]
    center = series.median()
    deviations = series - center
    mad = abs(deviations).median()

    if mad == 0:
        # Return a Series rather than a bare ndarray so both branches hand
        # callers the same type, index and name.
        return pd.Series(False, index=series.index, name=series.name, dtype=bool)

    scores = 0.6745 * deviations / mad
    return abs(scores) > 3.5
|
|
37
|
+
|
|
38
|
+
def percentile_method(data, column):
    """Flag values of ``data[column]`` below the 5th or above the 95th percentile.

    Missing values are ignored when the two limits are computed. With a plain
    percentile a single NaN makes both limits NaN, every comparison is then
    ``False``, and detection is silently disabled.

    Args:
        data: DataFrame holding the column to inspect.
        column: Name of the column to inspect.

    Returns:
        The element-wise boolean result of comparing the column against the
        two limits; ``True`` marks a value strictly outside them. NaN entries
        compare ``False`` and are never flagged.
    """
    series = data[column]

    # Computed separately (not as one [5, 95] call) so each result is always a
    # scalar, even for degenerate input.
    lower_limit = np.nanpercentile(series, 5)
    upper_limit = np.nanpercentile(series, 95)

    return (series > upper_limit) | (series < lower_limit)
|
|
43
|
+
|
|
44
|
+
def detect_outlier_method(data, column):
    """Suggest a detection method name for ``data[column]``.

    The column is first checked with ``is_numeric``; if that check fails, a
    message string is returned instead of a method name. Otherwise the label
    produced by ``skew_measurment`` decides the suggestion.

    Args:
        data: Table-like object indexable by ``column``.
        column: Name of the column to inspect.

    Returns:
        ``'z_score'`` for the two symmetric labels, ``'modified_z_score'`` for
        the four moderately/highly skewed labels, ``'Iqr_method'`` for any
        other label, or ``"It is not a numerical Column"`` when the numeric
        check fails.
    """
    if not is_numeric(data, column):
        return "It is not a numerical Column"

    skew_label = skew_measurment(data[column])

    symmetric_labels = ('Perfectly Symmetric', 'Approximately Symmetric')
    skewed_labels = (
        'Highly Right Skewed',
        'Highly Left Skewed',
        'Moderately Right Skewed',
        'Moderately Left Skewed',
    )

    if skew_label in symmetric_labels:
        return 'z_score'
    if skew_label in skewed_labels:
        return 'modified_z_score'
    # Any label not listed above falls back to the IQR rule.
    return 'Iqr_method'
|
|
56
|
+
|
|
57
|
+
def detect_outliers(data, column):
    """Detect outliers in ``data[column]`` with the automatically suggested method.

    The method name comes from ``detect_outlier_method``; the matching
    detector is then applied to the same column.

    Args:
        data: Table-like object indexable by ``column``.
        column: Name of the column to inspect.

    Returns:
        Whatever the selected detector returns (a boolean outlier mask).
    """
    chosen = detect_outlier_method(data, column)

    detectors = {
        'z_score': z_score_method,
        'modified_z_score': modified_z_score,
    }
    # Every other value returned by detect_outlier_method (including its
    # non-numeric message) is routed to the IQR rule.
    detector = detectors.get(chosen, Iqr_method)
    return detector(data, column)
|