autooutlier 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. autooutlier-0.1.0/CHANGELOG.md +135 -0
  2. autooutlier-0.1.0/LICENSE +21 -0
  3. autooutlier-0.1.0/MANIFEST.in +28 -0
  4. autooutlier-0.1.0/PKG-INFO +202 -0
  5. autooutlier-0.1.0/README.md +166 -0
  6. autooutlier-0.1.0/autooutlier/__init__.py +26 -0
  7. autooutlier-0.1.0/autooutlier/detection.py +67 -0
  8. autooutlier-0.1.0/autooutlier/handling.py +173 -0
  9. autooutlier-0.1.0/autooutlier/statistics.py +71 -0
  10. autooutlier-0.1.0/autooutlier/summary.py +25 -0
  11. autooutlier-0.1.0/autooutlier/utils.py +21 -0
  12. autooutlier-0.1.0/autooutlier/version.py +4 -0
  13. autooutlier-0.1.0/autooutlier.egg-info/PKG-INFO +202 -0
  14. autooutlier-0.1.0/autooutlier.egg-info/SOURCES.txt +33 -0
  15. autooutlier-0.1.0/autooutlier.egg-info/dependency_links.txt +1 -0
  16. autooutlier-0.1.0/autooutlier.egg-info/requires.txt +5 -0
  17. autooutlier-0.1.0/autooutlier.egg-info/top_level.txt +1 -0
  18. autooutlier-0.1.0/docs/api_reference.md +842 -0
  19. autooutlier-0.1.0/docs/changelog.md +96 -0
  20. autooutlier-0.1.0/docs/detection_methods.md +349 -0
  21. autooutlier-0.1.0/docs/examples.md +432 -0
  22. autooutlier-0.1.0/docs/faq.md +271 -0
  23. autooutlier-0.1.0/docs/handling_methods.md +513 -0
  24. autooutlier-0.1.0/docs/index.md +97 -0
  25. autooutlier-0.1.0/docs/installation.md +157 -0
  26. autooutlier-0.1.0/docs/quickstart.md +171 -0
  27. autooutlier-0.1.0/docs/user_guide.md +393 -0
  28. autooutlier-0.1.0/examples/basic_usage.py +78 -0
  29. autooutlier-0.1.0/pyproject.toml +69 -0
  30. autooutlier-0.1.0/requirements.txt +3 -0
  31. autooutlier-0.1.0/setup.cfg +4 -0
  32. autooutlier-0.1.0/tests/test_detection.py +93 -0
  33. autooutlier-0.1.0/tests/test_handling.py +127 -0
  34. autooutlier-0.1.0/tests/test_statistics.py +124 -0
  35. autooutlier-0.1.0/tests/test_summary.py +42 -0
@@ -0,0 +1,135 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+ ---
9
+
10
+ ## [0.1.0] - 2026-06-28
11
+
12
+ ### Added
13
+
14
+ #### Core Features
15
+ - Initial public release of **autooutlier**.
16
+ - Automatic outlier detection with intelligent method selection.
17
+ - Automatic outlier handling based on dataset characteristics.
18
+ - Manual selection of detection and handling methods.
19
+ - Support for custom replacement values.
20
+
21
+ #### Statistics Module
22
+ - Descriptive statistics:
23
+ - Mean
24
+ - Median
25
+ - Mode
26
+ - Standard Deviation
27
+ - Variance
28
+ - Data Range
29
+ - Quartile calculations (Q1, Q3, IQR)
30
+ - Skewness analysis
31
+ - Kurtosis analysis
32
+ - Distribution classification
33
+
34
+ #### Detection Module
35
+ Implemented the following outlier detection methods:
36
+
37
+ - IQR Method
38
+ - Z-Score Method
39
+ - Modified Z-Score Method
40
+ - Percentile Method
41
+ - Automatic Detection Method Selection
42
+ - Boolean Outlier Mask Generation
43
+
44
+ #### Handling Module
45
+ Implemented multiple outlier handling strategies:
46
+
47
+ - Winsorization
48
+ - Mean Replacement
49
+ - Median Replacement
50
+ - Mode Replacement
51
+ - Interpolation
52
+ - Forward Fill
53
+ - Backward Fill
54
+ - Custom Value Replacement
55
+ - Outlier Removal
56
+ - Automatic Handling Method Selection
57
+
58
+ #### Summary Module
59
+ Added comprehensive pre-cleaning analysis including:
60
+
61
+ - Recommended detection method
62
+ - Recommended handling method
63
+ - Number of outliers
64
+ - Percentage of outliers
65
+ - Distribution type
66
+ - Skewness report
67
+ - Statistical summary
68
+
69
+ #### Visualization Module
70
+ - Box Plot visualization using Seaborn.
71
+
72
+ #### Utility Module
73
+ Added helper functions for:
74
+
75
+ - Numeric column validation
76
+ - Continuous data validation
77
+ - Time-series detection
78
+ - Outlier counting
79
+ - Outlier percentage calculation
80
+
81
+ #### Documentation
82
+ Added professional project documentation including:
83
+
84
+ - README
85
+ - Installation Guide
86
+ - User Guide
87
+ - API Reference
88
+ - Detection Methods Guide
89
+ - Handling Methods Guide
90
+ - Examples
91
+ - FAQ
92
+ - Changelog
93
+
94
+ #### Examples
95
+ Added practical usage examples demonstrating:
96
+
97
+ - Automatic detection
98
+ - Automatic handling
99
+ - Manual detection
100
+ - Manual handling
101
+ - Summary generation
102
+
103
+ #### Testing
104
+ Added unit tests for:
105
+
106
+ - Detection module
107
+ - Handling module
108
+ - Statistics module
109
+ - Summary module
110
+
111
+ #### Packaging
112
+ Configured the project for distribution using:
113
+
114
+ - pyproject.toml
115
+ - setuptools
116
+ - MANIFEST.in
117
+ - Editable installation support
118
+ - Source distribution (sdist)
119
+ - Wheel distribution
120
+
121
+ #### Project Structure
122
+ Added:
123
+
124
+ - MIT License
125
+ - Version management
126
+ - Documentation folder
127
+ - Examples folder
128
+ - Tests folder
129
+ - Professional package layout
130
+
131
+ ---
132
+
133
+ ## License
134
+
135
+ This project is licensed under the MIT License.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Suruthika C D
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,28 @@
1
+ # ------------------------------------------------------------------
2
+ # MANIFEST.in
3
+ # Files to include in the source distribution (sdist)
4
+ # ------------------------------------------------------------------
5
+
6
+ # Project metadata
7
+ include LICENSE
8
+ include README.md
9
+ include CHANGELOG.md
10
+ include requirements.txt
11
+
12
+ # Include all package source files
13
+ recursive-include autooutlier *.py
14
+
15
+ # Include documentation
16
+ recursive-include docs *.md
17
+
18
+ # Include examples
19
+ recursive-include examples *.py
20
+
21
+ # Include tests
22
+ recursive-include tests *.py
23
+
24
+ # Exclude Python cache files
25
+ global-exclude *.py[cod]
26
+ global-exclude __pycache__
27
+ global-exclude *.so
28
+ global-exclude .DS_Store
@@ -0,0 +1,202 @@
1
+ Metadata-Version: 2.4
2
+ Name: autooutlier
3
+ Version: 0.1.0
4
+ Summary: Automatic outlier detection and handling for Python.
5
+ Author: Suruthika C D
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/suruthika-cd/autooutlier
8
+ Project-URL: Repository, https://github.com/suruthika-cd/autooutlier
9
+ Project-URL: Documentation, https://github.com/suruthika-cd/autooutlier/tree/main/docs
10
+ Project-URL: Issues, https://github.com/suruthika-cd/autooutlier/issues
11
+ Project-URL: Changelog, https://github.com/suruthika-cd/autooutlier/blob/main/CHANGELOG.md
12
+ Keywords: outlier,outlier-detection,statistics,data-cleaning,data-science,machine-learning,preprocessing,pandas
13
+ Classifier: Development Status :: 3 - Alpha
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: Intended Audience :: Science/Research
16
+ Classifier: License :: OSI Approved :: MIT License
17
+ Classifier: Operating System :: OS Independent
18
+ Classifier: Programming Language :: Python :: 3
19
+ Classifier: Programming Language :: Python :: 3.8
20
+ Classifier: Programming Language :: Python :: 3.9
21
+ Classifier: Programming Language :: Python :: 3.10
22
+ Classifier: Programming Language :: Python :: 3.11
23
+ Classifier: Programming Language :: Python :: 3.12
24
+ Classifier: Topic :: Scientific/Engineering
25
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
26
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
27
+ Requires-Python: >=3.8
28
+ Description-Content-Type: text/markdown
29
+ License-File: LICENSE
30
+ Requires-Dist: numpy>=1.21.0
31
+ Requires-Dist: pandas>=1.3.0
32
+ Requires-Dist: scipy>=1.7.0
33
+ Requires-Dist: matplotlib>=3.4.0
34
+ Requires-Dist: seaborn>=0.11.0
35
+ Dynamic: license-file
36
+
37
+ # autooutlier
38
+
39
+ **Automatic Outlier Detection and Handling for Python**
40
+
41
+ [![Python 3.8+](https://img.shields.io/badge/python-3.8+-blue.svg)](https://www.python.org/downloads/)
42
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
43
+ [![PyPI version](https://img.shields.io/pypi/v/autooutlier.svg)](https://pypi.org/project/autooutlier/)
44
+
45
+ ---
46
+
47
+ ## Overview
48
+
49
+ `autooutlier` is a Python package that **automatically detects, analyzes, and handles outliers** in numerical data. It intelligently selects the best detection and handling methods based on data distribution — requiring **zero configuration** from the user.
50
+
51
+ Simply pass your DataFrame and column name, and `autooutlier` handles the rest.
52
+
53
+ ---
54
+
55
+ ## Features
56
+
57
+ - **Automatic Detection** — Selects the optimal outlier detection method (Z-Score, Modified Z-Score, or IQR) based on data skewness; the Percentile method is available through manual selection.
58
+ - **Automatic Handling** — Chooses the best outlier replacement strategy (winsorization, mean/median/mode replacement, interpolation, etc.) based on data characteristics.
59
+ - **Statistical Analysis** — Provides mean, median, mode, standard deviation, variance, skewness, kurtosis, and distribution classification.
60
+ - **Pre-Cleaning Summary** — Generates a comprehensive report before cleaning, including detection method, handling strategy, outlier count, and percentage.
61
+ - **Post-Cleaning Report** — Returns both the cleaned dataset and an after-cleaning summary report.
62
+ - **Flexible Manual Control** — Override automatic selections with manual detection and handling methods when needed.
63
+ - **Visualization** — Built-in box plot support via Seaborn.
64
+
65
+ ---
66
+
67
+ ## Installation
68
+
69
+ ```bash
70
+ pip install autooutlier
71
+ ```
72
+
73
+ Or install from source:
74
+
75
+ ```bash
76
+ git clone https://github.com/suruthika-cd/autooutlier.git
77
+ cd autooutlier
78
+ pip install -e .
79
+ ```
80
+
81
+ ### Dependencies
82
+
83
+ - Python >= 3.8
84
+ - NumPy >= 1.21.0
85
+ - Pandas >= 1.3.0
86
+ - SciPy >= 1.7.0
87
+ - Seaborn >= 0.11.0
88
+ - Matplotlib >= 3.4.0
89
+
90
+ ---
91
+
92
+ ## Quick Start
93
+
94
+ ```python
95
+ import pandas as pd
96
+ from autooutlier import handle_outliers, before_cleaning_summary, detect_outliers
97
+
98
+ # Load your data
99
+ df = pd.DataFrame({"values": [10, 12, 14, 11, 13, 100, 15, 12, 14, 11]})
100
+
101
+ # Get a pre-cleaning summary report
102
+ summary = before_cleaning_summary(df, "values")
103
+ print(summary)
104
+
105
+ # Automatically detect and handle outliers
106
+ cleaned_data, report = handle_outliers(df, "values")
107
+ print(report)
108
+ print(cleaned_data)
109
+ ```
110
+
111
+ ---
112
+
113
+ ## Usage Examples
114
+
115
+ ### Automatic Outlier Detection
116
+
117
+ ```python
118
+ from autooutlier import detect_outliers
119
+
120
+ outlier_mask = detect_outliers(df, "column_name")
121
+ print(f"Outliers found: {outlier_mask.sum()}")
122
+ ```
123
+
124
+ ### Automatic Outlier Handling
125
+
126
+ ```python
127
+ from autooutlier import handle_outliers
128
+
129
+ # Fully automatic — detection and handling methods are chosen for you
130
+ cleaned_df, report = handle_outliers(df, "column_name")
131
+ ```
132
+
133
+ ### Manual Detection Method
134
+
135
+ ```python
136
+ # Use a specific detection method
137
+ cleaned_df, report = handle_outliers(df, "column_name", detection_method="z_score")
138
+ ```
139
+
140
+ Available detection methods: `'auto'`, `'Iqr_method'`, `'z_score'`, `'modified_z_score'`, `'percentile'`
141
+
142
+ ### Manual Handling Method
143
+
144
+ ```python
145
+ # Use a specific replacement strategy
146
+ cleaned_df, report = handle_outliers(df, "column_name", replacement="median")
147
+ ```
148
+
149
+ Available replacement methods: `'auto'`, `'interpolate'`, `'winsorization'`, `'median'`, `'mean'`, `'mode'`, `'custom'`, `'remove'`, `'bfill'`, `'ffill'`
150
+
151
+ ### Custom Value Replacement
152
+
153
+ ```python
154
+ cleaned_df, report = handle_outliers(df, "column_name", replacement="custom", value=0)
155
+ ```
156
+
157
+ ### Pre-Cleaning Summary
158
+
159
+ ```python
160
+ from autooutlier import before_cleaning_summary
161
+
162
+ summary = before_cleaning_summary(df, "column_name")
163
+ print(summary)
164
+ ```
165
+
166
+ Output includes: suggested detection method, handling method, skewness, distribution type, outlier count, and outlier percentage.
167
+
168
+ ---
169
+
170
+ ## API Overview
171
+
172
+ ### Public API
173
+
174
+ | Function | Description |
175
+ |---|---|
176
+ | `handle_outliers(data, column, detection_method='auto', replacement='auto', value=None)` | Detect and handle outliers. Returns `(cleaned_data, report)`. |
177
+ | `detect_outliers(data, column)` | Detect outliers automatically. Returns a boolean mask. |
178
+ | `detect_outlier_method(data, column)` | Returns the suggested detection method name. |
179
+ | `before_cleaning_summary(data, column)` | Returns a DataFrame summary report before cleaning. |
180
+
181
+ ### Module Reference
182
+
183
+ | Module | Contents |
184
+ |---|---|
185
+ | `autooutlier.statistics` | `mean`, `median`, `mode`, `std`, `var`, `data_range`, `q1`, `q3`, `iqr`, `skew`, `skew_measurment`, `is_normal`, `kurtosis`, `kurtosis_measurement` |
186
+ | `autooutlier.detection` | `Iqr_method`, `z_score_method`, `modified_z_score`, `percentile_method`, `detect_outlier_method`, `detect_outliers` |
187
+ | `autooutlier.handling` | `winsorization`, `interpolate`, `replace_with_mean`, `replace_with_median`, `replace_with_mode`, `replace_with_custom_value`, `replace_with_forward_fill`, `replace_with_backward_fill`, `remove_outliers`, `detect_handler`, `handle_outliers` |
188
+ | `autooutlier.summary` | `before_cleaning_summary` |
189
+ | `autooutlier.visualization` | `box_plot` |
190
+ | `autooutlier.utils` | `is_numeric`, `is_time_series`, `is_continous`, `outlier_count`, `outlier_percentage` |
191
+
192
+ ---
193
+
194
+ ## License
195
+
196
+ This project is licensed under the MIT License — see the [LICENSE](LICENSE) file for details.
197
+
198
+ ---
199
+
200
+ ## Changelog
201
+
202
+ See [CHANGELOG.md](CHANGELOG.md) for all notable changes.
@@ -0,0 +1,166 @@
1
+ # autooutlier
2
+
3
+ **Automatic Outlier Detection and Handling for Python**
4
+
5
+ [![Python 3.8+](https://img.shields.io/badge/python-3.8+-blue.svg)](https://www.python.org/downloads/)
6
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
7
+ [![PyPI version](https://img.shields.io/pypi/v/autooutlier.svg)](https://pypi.org/project/autooutlier/)
8
+
9
+ ---
10
+
11
+ ## Overview
12
+
13
+ `autooutlier` is a Python package that **automatically detects, analyzes, and handles outliers** in numerical data. It intelligently selects the best detection and handling methods based on data distribution — requiring **zero configuration** from the user.
14
+
15
+ Simply pass your DataFrame and column name, and `autooutlier` handles the rest.
16
+
17
+ ---
18
+
19
+ ## Features
20
+
21
+ - **Automatic Detection** — Selects the optimal outlier detection method (Z-Score, Modified Z-Score, or IQR) based on data skewness; the Percentile method is available through manual selection.
22
+ - **Automatic Handling** — Chooses the best outlier replacement strategy (winsorization, mean/median/mode replacement, interpolation, etc.) based on data characteristics.
23
+ - **Statistical Analysis** — Provides mean, median, mode, standard deviation, variance, skewness, kurtosis, and distribution classification.
24
+ - **Pre-Cleaning Summary** — Generates a comprehensive report before cleaning, including detection method, handling strategy, outlier count, and percentage.
25
+ - **Post-Cleaning Report** — Returns both the cleaned dataset and an after-cleaning summary report.
26
+ - **Flexible Manual Control** — Override automatic selections with manual detection and handling methods when needed.
27
+ - **Visualization** — Built-in box plot support via Seaborn.
28
+
29
+ ---
30
+
31
+ ## Installation
32
+
33
+ ```bash
34
+ pip install autooutlier
35
+ ```
36
+
37
+ Or install from source:
38
+
39
+ ```bash
40
+ git clone https://github.com/suruthika-cd/autooutlier.git
41
+ cd autooutlier
42
+ pip install -e .
43
+ ```
44
+
45
+ ### Dependencies
46
+
47
+ - Python >= 3.8
48
+ - NumPy >= 1.21.0
49
+ - Pandas >= 1.3.0
50
+ - SciPy >= 1.7.0
51
+ - Seaborn >= 0.11.0
52
+ - Matplotlib >= 3.4.0
53
+
54
+ ---
55
+
56
+ ## Quick Start
57
+
58
+ ```python
59
+ import pandas as pd
60
+ from autooutlier import handle_outliers, before_cleaning_summary, detect_outliers
61
+
62
+ # Load your data
63
+ df = pd.DataFrame({"values": [10, 12, 14, 11, 13, 100, 15, 12, 14, 11]})
64
+
65
+ # Get a pre-cleaning summary report
66
+ summary = before_cleaning_summary(df, "values")
67
+ print(summary)
68
+
69
+ # Automatically detect and handle outliers
70
+ cleaned_data, report = handle_outliers(df, "values")
71
+ print(report)
72
+ print(cleaned_data)
73
+ ```
74
+
75
+ ---
76
+
77
+ ## Usage Examples
78
+
79
+ ### Automatic Outlier Detection
80
+
81
+ ```python
82
+ from autooutlier import detect_outliers
83
+
84
+ outlier_mask = detect_outliers(df, "column_name")
85
+ print(f"Outliers found: {outlier_mask.sum()}")
86
+ ```
87
+
88
+ ### Automatic Outlier Handling
89
+
90
+ ```python
91
+ from autooutlier import handle_outliers
92
+
93
+ # Fully automatic — detection and handling methods are chosen for you
94
+ cleaned_df, report = handle_outliers(df, "column_name")
95
+ ```
96
+
97
+ ### Manual Detection Method
98
+
99
+ ```python
100
+ # Use a specific detection method
101
+ cleaned_df, report = handle_outliers(df, "column_name", detection_method="z_score")
102
+ ```
103
+
104
+ Available detection methods: `'auto'`, `'Iqr_method'`, `'z_score'`, `'modified_z_score'`, `'percentile'`
105
+
106
+ ### Manual Handling Method
107
+
108
+ ```python
109
+ # Use a specific replacement strategy
110
+ cleaned_df, report = handle_outliers(df, "column_name", replacement="median")
111
+ ```
112
+
113
+ Available replacement methods: `'auto'`, `'interpolate'`, `'winsorization'`, `'median'`, `'mean'`, `'mode'`, `'custom'`, `'remove'`, `'bfill'`, `'ffill'`
114
+
115
+ ### Custom Value Replacement
116
+
117
+ ```python
118
+ cleaned_df, report = handle_outliers(df, "column_name", replacement="custom", value=0)
119
+ ```
120
+
121
+ ### Pre-Cleaning Summary
122
+
123
+ ```python
124
+ from autooutlier import before_cleaning_summary
125
+
126
+ summary = before_cleaning_summary(df, "column_name")
127
+ print(summary)
128
+ ```
129
+
130
+ Output includes: suggested detection method, handling method, skewness, distribution type, outlier count, and outlier percentage.
131
+
132
+ ---
133
+
134
+ ## API Overview
135
+
136
+ ### Public API
137
+
138
+ | Function | Description |
139
+ |---|---|
140
+ | `handle_outliers(data, column, detection_method='auto', replacement='auto', value=None)` | Detect and handle outliers. Returns `(cleaned_data, report)`. |
141
+ | `detect_outliers(data, column)` | Detect outliers automatically. Returns a boolean mask. |
142
+ | `detect_outlier_method(data, column)` | Returns the suggested detection method name. |
143
+ | `before_cleaning_summary(data, column)` | Returns a DataFrame summary report before cleaning. |
144
+
145
+ ### Module Reference
146
+
147
+ | Module | Contents |
148
+ |---|---|
149
+ | `autooutlier.statistics` | `mean`, `median`, `mode`, `std`, `var`, `data_range`, `q1`, `q3`, `iqr`, `skew`, `skew_measurment`, `is_normal`, `kurtosis`, `kurtosis_measurement` |
150
+ | `autooutlier.detection` | `Iqr_method`, `z_score_method`, `modified_z_score`, `percentile_method`, `detect_outlier_method`, `detect_outliers` |
151
+ | `autooutlier.handling` | `winsorization`, `interpolate`, `replace_with_mean`, `replace_with_median`, `replace_with_mode`, `replace_with_custom_value`, `replace_with_forward_fill`, `replace_with_backward_fill`, `remove_outliers`, `detect_handler`, `handle_outliers` |
152
+ | `autooutlier.summary` | `before_cleaning_summary` |
153
+ | `autooutlier.visualization` | `box_plot` |
154
+ | `autooutlier.utils` | `is_numeric`, `is_time_series`, `is_continous`, `outlier_count`, `outlier_percentage` |
155
+
156
+ ---
157
+
158
+ ## License
159
+
160
+ This project is licensed under the MIT License — see the [LICENSE](LICENSE) file for details.
161
+
162
+ ---
163
+
164
+ ## Changelog
165
+
166
+ See [CHANGELOG.md](CHANGELOG.md) for all notable changes.
@@ -0,0 +1,26 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ autooutlier - Automatic Outlier Detection and Handling for Python
4
+ =================================================================
5
+
6
+ A professional Python package for automatic outlier detection,
7
+ handling, and statistical analysis of numerical data.
8
+
9
+ Usage:
10
+ >>> import autooutlier
11
+ >>> from autooutlier import handle_outliers, detect_outliers
12
+ """
13
+
14
+ from .handling import handle_outliers
15
+ from .summary import before_cleaning_summary
16
+ from .detection import detect_outliers
17
+ from .detection import detect_outlier_method
18
+ from .version import __version__
19
+
20
+ __all__ = [
21
+ "handle_outliers",
22
+ "before_cleaning_summary",
23
+ "detect_outliers",
24
+ "detect_outlier_method",
25
+ "__version__",
26
+ ]
@@ -0,0 +1,67 @@
1
+ # -*- coding: utf-8 -*-
2
+ """Outlier detection methods."""
3
+
4
+ import numpy as np
5
+
6
+ from .statistics import mean, std, q1, q3, iqr, skew_measurment
7
+ from .utils import is_numeric
8
+
9
+
10
def Iqr_method(data, column):
    """Flag values of ``data[column]`` lying outside the 1.5 * IQR fences.

    The quartiles and the interquartile range come from the ``q1``, ``q3``
    and ``iqr`` helpers imported from ``.statistics``.

    Args:
        data: Table-like object indexed by column name (a pandas DataFrame
            in the package's documented usage).
        column: Name of the column to inspect.

    Returns:
        The element-wise boolean result of comparing ``data[column]``
        against the fences; ``True`` marks an outlier.
    """
    values = data[column]
    first_quartile = q1(values)
    third_quartile = q3(values)
    spread = iqr(values)

    # Fences sit 1.5 interquartile ranges beyond each quartile.
    lower_fence = first_quartile - 1.5 * spread
    upper_fence = third_quartile + 1.5 * spread

    return (values > upper_fence) | (values < lower_fence)
18
+
19
def z_score_method(data, column, threshold=3):
    """Flag values whose absolute z-score exceeds ``threshold``.

    The mean and standard deviation come from the ``mean`` and ``std``
    helpers imported from ``.statistics``.

    Args:
        data: pandas DataFrame holding the column.
        column: Name of the column to inspect.
        threshold: Absolute z-score above which a value counts as an
            outlier. Defaults to 3, the cutoff this function always used.

    Returns:
        A boolean Series aligned with ``data[column]``; ``True`` marks an
        outlier. When the standard deviation is 0 the z-score is undefined,
        so an all-``False`` Series is returned.
    """
    # Local import: this module imports only numpy at top level.
    import pandas as pd

    values = data[column]
    mean_value = mean(values)
    std_value = std(values)

    if std_value == 0:
        # Return the same type and index as the normal path so callers can
        # treat both results identically (e.g. index-aligned masking).
        return pd.Series(False, index=values.index, name=values.name, dtype=bool)

    z_scores = (values - mean_value) / std_value
    return abs(z_scores) > threshold
27
+
28
def modified_z_score(data, column, threshold=3.5):
    """Flag values whose absolute modified z-score exceeds ``threshold``.

    The score is ``0.6745 * (x - median) / MAD``, where MAD is the median of
    the absolute deviations from the column median.

    Args:
        data: pandas DataFrame holding the column.
        column: Name of the column to inspect.
        threshold: Absolute modified z-score above which a value counts as
            an outlier. Defaults to 3.5, the cutoff this function always
            used.

    Returns:
        A boolean Series aligned with ``data[column]``; ``True`` marks an
        outlier. When MAD is 0 the score is undefined, so an all-``False``
        Series is returned.
    """
    # Local import: this module imports only numpy at top level.
    import pandas as pd

    values = data[column]
    median_value = values.median()
    deviations = values - median_value
    mad = abs(deviations).median()

    if mad == 0:
        # Return the same type and index as the normal path so callers can
        # treat both results identically (e.g. index-aligned masking).
        return pd.Series(False, index=values.index, name=values.name, dtype=bool)

    scores = 0.6745 * deviations / mad
    return abs(scores) > threshold
37
+
38
def percentile_method(data, column, lower=5, upper=95):
    """Flag values outside the ``lower``/``upper`` percentiles of a column.

    Args:
        data: Table-like object indexed by column name (a pandas DataFrame
            in the package's documented usage).
        column: Name of the column to inspect.
        lower: Lower percentile bound, in [0, 100]. Defaults to 5.
        upper: Upper percentile bound, in [0, 100]. Defaults to 95.

    Returns:
        The element-wise boolean result of comparing ``data[column]``
        against the two percentile limits; ``True`` marks an outlier.
        Missing values compare as ``False``.

    Raises:
        ValueError: If the bounds do not satisfy
            ``0 <= lower <= upper <= 100``.
    """
    if not 0 <= lower <= upper <= 100:
        raise ValueError(
            "percentile bounds must satisfy 0 <= lower <= upper <= 100"
        )

    values = data[column]
    # nanpercentile ignores missing values; plain np.percentile returns NaN
    # as soon as one NaN is present, which would silently flag nothing.
    lower_limit = np.nanpercentile(values, lower)
    upper_limit = np.nanpercentile(values, upper)

    return (values > upper_limit) | (values < lower_limit)
43
+
44
def detect_outlier_method(data, column):
    """Suggest a detection method name for ``data[column]``.

    The choice is driven by the label that ``skew_measurment`` (imported
    from ``.statistics``) assigns to the column.

    Args:
        data: Table-like object indexed by column name.
        column: Name of the column to inspect.

    Returns:
        ``'z_score'`` for the symmetric labels, ``'modified_z_score'`` for
        the moderately/highly skewed labels, and ``'Iqr_method'`` for any
        other label. If ``is_numeric`` rejects the column, the message
        ``"It is not a numerical Column"`` is returned instead of a method
        name.
    """
    if not is_numeric(data, column):
        return "It is not a numerical Column"

    symmetric_labels = ('Perfectly Symmetric', 'Approximately Symmetric')
    skewed_labels = (
        'Highly Right Skewed',
        'Highly Left Skewed',
        'Moderately Right Skewed',
        'Moderately Left Skewed',
    )

    skew_label = skew_measurment(data[column])
    if skew_label in symmetric_labels:
        return 'z_score'
    if skew_label in skewed_labels:
        return 'modified_z_score'
    return 'Iqr_method'
56
+
57
def detect_outliers(data, column):
    """Detect outliers in ``data[column]`` with an automatically chosen method.

    The method name comes from ``detect_outlier_method``. ``'z_score'`` and
    ``'modified_z_score'`` dispatch to their detectors; every other result
    (including ``'Iqr_method'`` and the non-numeric message string) falls
    back to ``Iqr_method``.

    Args:
        data: Table-like object indexed by column name.
        column: Name of the column to inspect.

    Returns:
        The boolean outlier mask produced by the selected detector.
    """
    detectors = {
        'z_score': z_score_method,
        'modified_z_score': modified_z_score,
    }
    chosen = detect_outlier_method(data, column)
    detector = detectors.get(chosen, Iqr_method)
    return detector(data, column)