dsr-data-tools 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dsr_data_tools-0.0.1/LICENSE +21 -0
- dsr_data_tools-0.0.1/PKG-INFO +66 -0
- dsr_data_tools-0.0.1/README.md +39 -0
- dsr_data_tools-0.0.1/pyproject.toml +43 -0
- dsr_data_tools-0.0.1/setup.cfg +4 -0
- dsr_data_tools-0.0.1/src/dsr_data_tools/__init__.py +33 -0
- dsr_data_tools-0.0.1/src/dsr_data_tools/analysis.py +467 -0
- dsr_data_tools-0.0.1/src/dsr_data_tools/enums.py +81 -0
- dsr_data_tools-0.0.1/src/dsr_data_tools/recommendations.py +537 -0
- dsr_data_tools-0.0.1/src/dsr_data_tools.egg-info/PKG-INFO +66 -0
- dsr_data_tools-0.0.1/src/dsr_data_tools.egg-info/SOURCES.txt +12 -0
- dsr_data_tools-0.0.1/src/dsr_data_tools.egg-info/dependency_links.txt +1 -0
- dsr_data_tools-0.0.1/src/dsr_data_tools.egg-info/requires.txt +8 -0
- dsr_data_tools-0.0.1/src/dsr_data_tools.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Scott Roberts
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: dsr-data-tools
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: Generic data handling utilities including data splitting and analysis.
|
|
5
|
+
Author-email: Scott Roberts <scottrdeveloper@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Keywords: data,splitting,analysis,ml-data
|
|
8
|
+
Classifier: Development Status :: 3 - Alpha
|
|
9
|
+
Classifier: Intended Audience :: Developers
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
16
|
+
Requires-Python: >=3.10
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
License-File: LICENSE
|
|
19
|
+
Requires-Dist: dsr-utils>=0.0.1
|
|
20
|
+
Requires-Dist: numpy>=2.0.0
|
|
21
|
+
Requires-Dist: pandas>=2.0.0
|
|
22
|
+
Provides-Extra: dev
|
|
23
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
24
|
+
Requires-Dist: black>=23.0; extra == "dev"
|
|
25
|
+
Requires-Dist: ruff>=0.1.0; extra == "dev"
|
|
26
|
+
Dynamic: license-file
|
|
27
|
+
|
|
28
|
+
# dsr-data-tools
|
|
29
|
+
|
|
30
|
+
Data analysis and exploration tools for exploratory data analysis (EDA).
|
|
31
|
+
|
|
32
|
+
## Features
|
|
33
|
+
|
|
34
|
+
- **Dataset Analysis**: Comprehensive statistical summaries and data quality assessment
|
|
35
|
+
- **Data Exploration**: Tools for understanding data distributions, correlations, and patterns
|
|
36
|
+
- **Quality Metrics**: Missing value detection, data type analysis, and anomaly identification
|
|
37
|
+
|
|
38
|
+
## Installation
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
pip install dsr-data-tools
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
## Usage
|
|
45
|
+
|
|
46
|
+
```python
|
|
47
|
+
import pandas as pd
|
|
48
|
+
from dsr_data_tools import analyze_dataset
|
|
49
|
+
|
|
50
|
+
# Load your data
|
|
51
|
+
df = pd.read_csv('data.csv')
|
|
52
|
+
|
|
53
|
+
# Perform comprehensive analysis
|
|
54
|
+
analyze_dataset(df)
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
## Requirements
|
|
58
|
+
|
|
59
|
+
- Python >= 3.10
|
|
60
|
+
- pandas
|
|
61
|
+
- numpy
|
|
62
|
+
- dsr-utils
|
|
63
|
+
|
|
64
|
+
## License
|
|
65
|
+
|
|
66
|
+
MIT License - see LICENSE file for details
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# dsr-data-tools
|
|
2
|
+
|
|
3
|
+
Data analysis and exploration tools for exploratory data analysis (EDA).
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- **Dataset Analysis**: Comprehensive statistical summaries and data quality assessment
|
|
8
|
+
- **Data Exploration**: Tools for understanding data distributions, correlations, and patterns
|
|
9
|
+
- **Quality Metrics**: Missing value detection, data type analysis, and anomaly identification
|
|
10
|
+
|
|
11
|
+
## Installation
|
|
12
|
+
|
|
13
|
+
```bash
|
|
14
|
+
pip install dsr-data-tools
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
## Usage
|
|
18
|
+
|
|
19
|
+
```python
|
|
20
|
+
import pandas as pd
|
|
21
|
+
from dsr_data_tools import analyze_dataset
|
|
22
|
+
|
|
23
|
+
# Load your data
|
|
24
|
+
df = pd.read_csv('data.csv')
|
|
25
|
+
|
|
26
|
+
# Perform comprehensive analysis
|
|
27
|
+
analyze_dataset(df)
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
## Requirements
|
|
31
|
+
|
|
32
|
+
- Python >= 3.10
|
|
33
|
+
- pandas
|
|
34
|
+
- numpy
|
|
35
|
+
- dsr-utils
|
|
36
|
+
|
|
37
|
+
## License
|
|
38
|
+
|
|
39
|
+
MIT License - see LICENSE file for details
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "dsr-data-tools"
|
|
7
|
+
version = "0.0.1"
|
|
8
|
+
description = "Generic data handling utilities including data splitting and analysis."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
license = {text = "MIT"}
|
|
12
|
+
authors = [
|
|
13
|
+
{name = "Scott Roberts", email="scottrdeveloper@gmail.com"}
|
|
14
|
+
]
|
|
15
|
+
keywords = ["data", "splitting", "analysis", "ml-data"]
|
|
16
|
+
classifiers = [
|
|
17
|
+
"Development Status :: 3 - Alpha",
|
|
18
|
+
"Intended Audience :: Developers",
|
|
19
|
+
"License :: OSI Approved :: MIT License",
|
|
20
|
+
"Programming Language :: Python :: 3",
|
|
21
|
+
"Programming Language :: Python :: 3.10",
|
|
22
|
+
"Programming Language :: Python :: 3.11",
|
|
23
|
+
"Programming Language :: Python :: 3.12",
|
|
24
|
+
"Programming Language :: Python :: 3.13",
|
|
25
|
+
]
|
|
26
|
+
dependencies = [
|
|
27
|
+
"dsr-utils>=0.0.1",
|
|
28
|
+
"numpy>=2.0.0",
|
|
29
|
+
"pandas>=2.0.0",
|
|
30
|
+
]
|
|
31
|
+
|
|
32
|
+
[project.optional-dependencies]
|
|
33
|
+
dev = [
|
|
34
|
+
"pytest>=7.0",
|
|
35
|
+
"black>=23.0",
|
|
36
|
+
"ruff>=0.1.0",
|
|
37
|
+
]
|
|
38
|
+
|
|
39
|
+
[tool.setuptools]
|
|
40
|
+
packages = ["dsr_data_tools"]
|
|
41
|
+
|
|
42
|
+
[tool.setuptools.package-dir]
|
|
43
|
+
"" = "src"
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
"""
|
|
2
|
+
dsr_data_tools: Generic data handling utilities for data splitting and analysis.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from dsr_data_tools.analysis import (
|
|
6
|
+
DataframeColumn,
|
|
7
|
+
DataframeInfo,
|
|
8
|
+
analyze_column_data,
|
|
9
|
+
analyze_dataset,
|
|
10
|
+
)
|
|
11
|
+
from dsr_data_tools.recommendations import apply_recommendations
|
|
12
|
+
from dsr_data_tools.enums import (
|
|
13
|
+
RecommendationType,
|
|
14
|
+
EncodingStrategy,
|
|
15
|
+
MissingValueStrategy,
|
|
16
|
+
OutlierStrategy,
|
|
17
|
+
ImbalanceStrategy,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
__all__ = [
|
|
21
|
+
"DataframeColumn",
|
|
22
|
+
"DataframeInfo",
|
|
23
|
+
"analyze_column_data",
|
|
24
|
+
"analyze_dataset",
|
|
25
|
+
"apply_recommendations",
|
|
26
|
+
"RecommendationType",
|
|
27
|
+
"EncodingStrategy",
|
|
28
|
+
"MissingValueStrategy",
|
|
29
|
+
"OutlierStrategy",
|
|
30
|
+
"ImbalanceStrategy",
|
|
31
|
+
]
|
|
32
|
+
|
|
33
|
+
__version__ = "0.0.1"
|
|
@@ -0,0 +1,467 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
import pandas as pd
|
|
3
|
+
from typing import Type
|
|
4
|
+
from dsr_utils import strings
|
|
5
|
+
from dsr_data_tools.enums import (
|
|
6
|
+
RecommendationType,
|
|
7
|
+
EncodingStrategy,
|
|
8
|
+
MissingValueStrategy,
|
|
9
|
+
OutlierStrategy,
|
|
10
|
+
ImbalanceStrategy,
|
|
11
|
+
)
|
|
12
|
+
from dsr_data_tools.recommendations import (
|
|
13
|
+
Recommendation,
|
|
14
|
+
NonInformativeRecommendation,
|
|
15
|
+
MissingValuesRecommendation,
|
|
16
|
+
EncodingRecommendation,
|
|
17
|
+
ClassImbalanceRecommendation,
|
|
18
|
+
OutlierDetectionRecommendation,
|
|
19
|
+
BooleanClassificationRecommendation,
|
|
20
|
+
BinningRecommendation,
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class DataframeColumn:
    """Metadata snapshot for one DataFrame column.

    Captures the column name, the number of non-null entries, and the
    pandas dtype so the information can be displayed or analyzed later
    without holding onto the full DataFrame.

    Attributes:
        name (str): The column name.
        non_null_count (int): Number of non-null values in the column.
        data_type (Type): The pandas data type of the column.

    Example:
        >>> col = DataframeColumn('age', 3, float)
        >>> col.name
        'age'
        >>> col.non_null_count
        3
    """
    @staticmethod
    def dfc_list_from_df(df: pd.DataFrame) -> list[DataframeColumn]:
        """Build one DataframeColumn per column of *df*.

        Reads the column names, per-column non-null counts, and dtypes
        from the DataFrame and pairs them up positionally.

        Args:
            df (pd.DataFrame): The DataFrame to extract column information from.

        Returns:
            list[DataframeColumn]: One metadata object per column, in
            column order.

        Example:
            >>> df = pd.DataFrame({'name': ['Alice', 'Bob'], 'age': [25, 30]})
            >>> len(DataframeColumn.dfc_list_from_df(df))
            2
        """
        return [
            DataframeColumn(column_name, count, dtype)
            for column_name, count, dtype in zip(
                df.columns.tolist(),
                df.count().tolist(),
                df.dtypes.tolist(),
            )
        ]

    def __init__(
        self,
        name: str,
        non_null_count: int,
        data_type: Type
    ):
        self.__name = name
        self.__non_null_count = non_null_count
        self.__data_type = data_type

    @property
    def name(self) -> str:
        return self.__name

    @property
    def non_null_count(self) -> int:
        return self.__non_null_count

    @property
    def data_type(self) -> Type:
        return self.__data_type
97
|
+
|
|
98
|
+
|
|
99
|
+
class DataframeInfo:
    """Summary of a DataFrame's structure and content.

    Records the row count, the number of duplicated rows, and per-column
    metadata (via DataframeColumn) for data exploration and quality
    assessment.

    Attributes:
        row_count (int): Total number of rows in the DataFrame.
        duplicate_row_count (int): Number of duplicate rows detected.
        columns (list[DataframeColumn]): List of column metadata objects.

    Example:
        >>> df = pd.DataFrame({
        ...     'name': ['Alice', 'Bob', 'Alice'],
        ...     'age': [25, 30, 25]
        ... })
        >>> df_info = DataframeInfo(df)
        >>> df_info.row_count
        3
        >>> df_info.duplicate_row_count
        1
        >>> len(df_info.columns)
        2
    """

    def __init__(
        self,
        df: pd.DataFrame
    ):
        self.__row_count = len(df)
        self.__duplicate_row_count = df.duplicated().sum()
        self.__columns = DataframeColumn.dfc_list_from_df(df)

    @property
    def row_count(self) -> int:
        return self.__row_count

    @property
    def duplicate_row_count(self) -> int:
        return self.__duplicate_row_count

    @property
    def columns(self) -> list[DataframeColumn]:
        return self.__columns

    def info(self):
        """Print a formatted summary of the DataFrame.

        Shows the row count, the duplicate-row count, and a table of
        column name / non-null count / dtype for every column.

        Example:
            >>> df = pd.DataFrame({'name': ['Alice', 'Bob'], 'age': [25, 30]})
            >>> DataframeInfo(df).info()
            Rows: 2
            Duplicate rows: 0
            <BLANKLINE>
            Column          Non-null Data type
            name                   2 object
            age                    2 int64
        """
        print(f'Rows: {self.row_count}')
        print(f'Duplicate rows: {self.duplicate_row_count}')
        print()

        headers = ('Column', 'Non-null', 'Data type')
        widths = (15, 10, 12)
        print(f'{headers[0]:<{widths[0]}}{headers[1]:>{widths[1]}} '
              f'{headers[2]:<{widths[2]}}')

        for column in self.columns:
            print(f'{column.name:<{widths[0]}}'
                  f'{column.non_null_count:>{widths[1]}} '
                  f'{column.data_type.name:<{widths[2]}}')
+
|
|
174
|
+
|
|
175
|
+
def generate_recommendations(
    df: pd.DataFrame,
    target_column: str | None = None
) -> dict[str, dict[str, Recommendation]]:
    """Generate data preparation recommendations for each column in a DataFrame.

    Inspects every column's characteristics (cardinality, missing values,
    dtype, value distribution) and emits the matching Recommendation
    objects for it.

    Args:
        df (pd.DataFrame): The DataFrame to analyze.
        target_column (str | None): Name of the target column (for imbalance
            detection). If provided, class imbalance is analyzed for it.

    Returns:
        dict[str, dict[str, Recommendation]]: Nested dictionary mapping
        column names to recommendation types to Recommendation instances.

    Example:
        >>> df = pd.DataFrame({
        ...     'id': range(100),
        ...     'name': ['Alice'] * 100,
        ...     'age': [25] * 100 + [30] * 50,
        ...     'salary': [50000] * 50 + [100000] * 50
        ... })
        >>> recs = generate_recommendations(df, target_column='name')
        >>> recs['id']  # Non-informative (unique count == row count)
        >>> recs['age']['encoding']  # Binary encoding recommendation
    """
    recommendations: dict[str, dict[str, Recommendation]] = {}
    total_rows = len(df)

    for column in df.columns:
        series = df[column]
        column_recs: dict[str, Recommendation] = {}
        distinct = series.nunique()
        numeric = pd.api.types.is_numeric_dtype(series)

        # Non-informative: every row has its own value (e.g. an ID column).
        if distinct == total_rows:
            column_recs['non_informative'] = NonInformativeRecommendation(
                type=RecommendationType.NON_INFORMATIVE,
                column_name=column,
                description=f"Column '{column}' has unique value for each row.",
                reason="Unique count equals row count"
            )
            recommendations[column] = column_recs
            continue

        # Non-informative: high-cardinality object column (> 25% unique).
        if not numeric and distinct > total_rows * 0.25:
            column_recs['non_informative'] = NonInformativeRecommendation(
                type=RecommendationType.NON_INFORMATIVE,
                column_name=column,
                description=f"Column '{column}' has high cardinality ({distinct} unique values).",
                reason="High cardinality object type"
            )
            recommendations[column] = column_recs
            continue

        # Missing values: choose a strategy from the missing percentage.
        missing = series.isna().sum()
        if missing > 0:
            missing_pct = (missing / total_rows) * 100

            if missing_pct < 10:
                strategy = MissingValueStrategy.DROP_ROWS
            elif missing_pct > 50:
                strategy = MissingValueStrategy.DROP_COLUMN
            else:
                strategy = MissingValueStrategy.IMPUTE

            column_recs['missing_values'] = MissingValuesRecommendation(
                type=RecommendationType.MISSING_VALUES,
                column_name=column,
                description=f"Column '{column}' has {missing} missing values ({missing_pct:.1f}%).",
                missing_count=missing,
                missing_percentage=missing_pct,
                strategy=strategy
            )

        # Boolean classification: a numeric 0/1 feature column.
        # The target column is skipped so it stays numeric for classifiers.
        if numeric and distinct == 2 and column != target_column:
            observed = sorted(series.dropna().unique().tolist())
            if observed == [0.0, 1.0] or observed == [0, 1]:
                column_recs['boolean_classification'] = BooleanClassificationRecommendation(
                    type=RecommendationType.BOOLEAN_CLASSIFICATION,
                    column_name=column,
                    description=f"Column '{column}' should be treated as boolean.",
                    values=observed
                )

        # Encoding: categorical (non-numeric) feature columns.
        if not numeric and column != target_column:
            if distinct == 2:
                # Binary categorical: label-encode.
                column_recs['encoding'] = EncodingRecommendation(
                    type=RecommendationType.ENCODING,
                    column_name=column,
                    description=f"Column '{column}' is binary categorical; recommend LabelEncoder.",
                    encoder_type=EncodingStrategy.LABEL,
                    unique_values=distinct
                )
            elif 3 <= distinct <= 10:
                # Multi-class categorical (3-10 values): one-hot encode.
                column_recs['encoding'] = EncodingRecommendation(
                    type=RecommendationType.ENCODING,
                    column_name=column,
                    description=f"Column '{column}' is multi-class categorical; recommend OneHotEncoder.",
                    encoder_type=EncodingStrategy.ONEHOT,
                    unique_values=distinct
                )

        # Outliers: a max more than twice the mean hints at extreme values.
        if numeric:
            mean_value = series.mean()
            max_value = series.max()
            min_value = series.min()

            if max_value > mean_value * 2:
                column_recs['outlier_detection'] = OutlierDetectionRecommendation(
                    type=RecommendationType.OUTLIER_DETECTION,
                    column_name=column,
                    description=f"Column '{column}' has potential outliers (max={max_value:.2f}, mean={mean_value:.2f}).",
                    strategy=OutlierStrategy.SCALING,
                    max_value=max_value,
                    mean_value=mean_value
                )

        # Class imbalance: a (near-)binary target dominated by one class.
        if column == target_column and distinct <= 2:
            class_counts = series.value_counts()
            majority_pct = (class_counts.max() / total_rows) * 100

            if majority_pct > 70:
                column_recs['class_imbalance'] = ClassImbalanceRecommendation(
                    type=RecommendationType.CLASS_IMBALANCE,
                    column_name=column,
                    description=f"Target variable '{column}' shows class imbalance ({majority_pct:.1f}% majority class).",
                    majority_percentage=majority_pct,
                    strategy=ImbalanceStrategy.CLASS_WEIGHT
                )

        # Binning: quartile-based bins for age-like numeric columns.
        if numeric and column.lower() in ['age', 'years']:
            desc = series.describe()
            bins = [series.min() - 1, desc['25%'], desc['50%'],
                    desc['75%'], series.max()]
            labels = ['Low', 'Medium_Low', 'Medium_High', 'High']

            column_recs['binning'] = BinningRecommendation(
                type=RecommendationType.BINNING,
                column_name=column,
                description=f"Column '{column}' could be binned into {len(labels)} categories.",
                bins=bins,
                labels=labels
            )

        if column_recs:
            recommendations[column] = column_recs

    return recommendations
|
+
|
|
354
|
+
|
|
355
|
+
def analyze_column_data(
    series: pd.Series,
    dataframe_column: DataframeColumn
):
    """Print detailed statistics for a single DataFrame column.

    Always shows the name, dtype, null counts, unique count, and min/max.
    Float64 columns additionally get a count of integer-valued vs
    fractional entries; object columns get a count of numeric-looking vs
    non-numeric entries.

    Args:
        series (pd.Series): The data series to analyze.
        dataframe_column (DataframeColumn): Metadata about the column.

    Example:
        >>> df = pd.DataFrame({'price': [10.5, 20.0, 30.99]})
        >>> col = DataframeColumn('price', 3, float)
        >>> analyze_column_data(df['price'], col)
        # Prints detailed statistics
    """
    n_values = len(series)
    dtype_name = dataframe_column.data_type.name
    float_column = (dtype_name == 'float64')
    object_column = (dtype_name == 'object')

    # Extra report for float columns: how many values are whole numbers.
    float_report = ''
    if float_column:
        whole_count = series.apply(lambda v: v.is_integer()).sum()
        float_report = f"""Integer values: {whole_count}
Non-integer values: {n_values - whole_count}"""

    # Extra report for object columns: how many values look numeric.
    object_report = ''
    if object_column:
        numeric_like_count = series.str.isnumeric().sum()
        non_numeric_count = n_values - \
            series.apply(strings.is_float_string).sum()

        object_report = f"""Numeric values: {numeric_like_count}
Non-numeric values: {non_numeric_count}"""

    print(f"""
Column: {dataframe_column.name}
Data type: {dtype_name}
Non-null: {dataframe_column.non_null_count}
N/A count: {series.isna().sum()}
Unique values: {series.nunique()}
Min value: {series.min()}
Max value: {series.max()}""")

    if float_column:
        print(float_report)

    if object_column:
        print(object_report)
412
|
+
|
|
413
|
+
|
|
414
|
+
def analyze_dataset(
    df: pd.DataFrame,
    target_column: str | None = None,
    generate_recs: bool = False
) -> tuple[DataframeInfo, dict[str, dict[str, Recommendation]] | None]:
    """Perform comprehensive analysis of all columns in a DataFrame.

    Prints overall DataFrame information (row count, duplicates) and then a
    detailed per-column breakdown (dtype, null counts, unique values, and
    type-specific statistics). Optionally generates and prints data
    preparation recommendations alongside each column.

    Args:
        df (pd.DataFrame): The DataFrame to analyze.
        target_column (str | None): Name of the target column (used when
            generating recommendations).
        generate_recs (bool): Whether to generate recommendations.
            Default is False.

    Returns:
        tuple[DataframeInfo, dict | None]: A tuple containing:
            - DataframeInfo object with structured DataFrame information
            - Recommendations dict (or None if generate_recs is False)

    Example:
        >>> df = pd.DataFrame({
        ...     'name': ['Alice', 'Bob', 'Charlie'],
        ...     'age': [25, 30, 35],
        ...     'salary': [50000.0, 60000.5, 75000.0]
        ... })
        >>> info, recs = analyze_dataset(df, generate_recs=True)
        # Prints comprehensive analysis of all columns and returns recommendations
    """
    df_info = DataframeInfo(df)
    df_info.info()

    recommendations = None
    if generate_recs:
        recommendations = generate_recommendations(df, target_column)

    for column in df_info.columns:
        analyze_column_data(df[column.name], column)

        # Show any recommendations generated for this column.
        if recommendations and column.name in recommendations:
            column_recs = recommendations[column.name]
            if column_recs:
                print("\n  Recommendations:")
                for recommendation in column_recs.values():
                    recommendation.info()
                print()

    return df_info, recommendations
|