dragon-ml-toolbox 3.5.0__tar.gz → 3.6.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dragon-ml-toolbox has been flagged as potentially problematic; the original page links to further details.
- {dragon_ml_toolbox-3.5.0/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-3.6.0}/PKG-INFO +1 -1
- {dragon_ml_toolbox-3.5.0 → dragon_ml_toolbox-3.6.0/dragon_ml_toolbox.egg-info}/PKG-INFO +1 -1
- {dragon_ml_toolbox-3.5.0 → dragon_ml_toolbox-3.6.0}/ml_tools/ETL_engineering.py +5 -6
- {dragon_ml_toolbox-3.5.0 → dragon_ml_toolbox-3.6.0}/ml_tools/data_exploration.py +22 -17
- {dragon_ml_toolbox-3.5.0 → dragon_ml_toolbox-3.6.0}/pyproject.toml +1 -1
- {dragon_ml_toolbox-3.5.0 → dragon_ml_toolbox-3.6.0}/LICENSE +0 -0
- {dragon_ml_toolbox-3.5.0 → dragon_ml_toolbox-3.6.0}/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-3.5.0 → dragon_ml_toolbox-3.6.0}/README.md +0 -0
- {dragon_ml_toolbox-3.5.0 → dragon_ml_toolbox-3.6.0}/dragon_ml_toolbox.egg-info/SOURCES.txt +0 -0
- {dragon_ml_toolbox-3.5.0 → dragon_ml_toolbox-3.6.0}/dragon_ml_toolbox.egg-info/dependency_links.txt +0 -0
- {dragon_ml_toolbox-3.5.0 → dragon_ml_toolbox-3.6.0}/dragon_ml_toolbox.egg-info/requires.txt +0 -0
- {dragon_ml_toolbox-3.5.0 → dragon_ml_toolbox-3.6.0}/dragon_ml_toolbox.egg-info/top_level.txt +0 -0
- {dragon_ml_toolbox-3.5.0 → dragon_ml_toolbox-3.6.0}/ml_tools/GUI_tools.py +0 -0
- {dragon_ml_toolbox-3.5.0 → dragon_ml_toolbox-3.6.0}/ml_tools/MICE_imputation.py +0 -0
- {dragon_ml_toolbox-3.5.0 → dragon_ml_toolbox-3.6.0}/ml_tools/ML_callbacks.py +0 -0
- {dragon_ml_toolbox-3.5.0 → dragon_ml_toolbox-3.6.0}/ml_tools/ML_evaluation.py +0 -0
- {dragon_ml_toolbox-3.5.0 → dragon_ml_toolbox-3.6.0}/ml_tools/ML_trainer.py +0 -0
- {dragon_ml_toolbox-3.5.0 → dragon_ml_toolbox-3.6.0}/ml_tools/ML_tutorial.py +0 -0
- {dragon_ml_toolbox-3.5.0 → dragon_ml_toolbox-3.6.0}/ml_tools/PSO_optimization.py +0 -0
- {dragon_ml_toolbox-3.5.0 → dragon_ml_toolbox-3.6.0}/ml_tools/RNN_forecast.py +0 -0
- {dragon_ml_toolbox-3.5.0 → dragon_ml_toolbox-3.6.0}/ml_tools/VIF_factor.py +0 -0
- {dragon_ml_toolbox-3.5.0 → dragon_ml_toolbox-3.6.0}/ml_tools/__init__.py +0 -0
- {dragon_ml_toolbox-3.5.0 → dragon_ml_toolbox-3.6.0}/ml_tools/_particle_swarm_optimization.py +0 -0
- {dragon_ml_toolbox-3.5.0 → dragon_ml_toolbox-3.6.0}/ml_tools/_pytorch_models.py +0 -0
- {dragon_ml_toolbox-3.5.0 → dragon_ml_toolbox-3.6.0}/ml_tools/datasetmaster.py +0 -0
- {dragon_ml_toolbox-3.5.0 → dragon_ml_toolbox-3.6.0}/ml_tools/ensemble_learning.py +0 -0
- {dragon_ml_toolbox-3.5.0 → dragon_ml_toolbox-3.6.0}/ml_tools/handle_excel.py +0 -0
- {dragon_ml_toolbox-3.5.0 → dragon_ml_toolbox-3.6.0}/ml_tools/logger.py +0 -0
- {dragon_ml_toolbox-3.5.0 → dragon_ml_toolbox-3.6.0}/ml_tools/utilities.py +0 -0
- {dragon_ml_toolbox-3.5.0 → dragon_ml_toolbox-3.6.0}/setup.cfg +0 -0
|
@@ -48,13 +48,12 @@ class ColumnCleaner:
|
|
|
48
48
|
## Usage Example
|
|
49
49
|
|
|
50
50
|
```python
|
|
51
|
-
|
|
52
|
-
# Matches
|
|
53
|
-
r'
|
|
51
|
+
id_rules = {
|
|
52
|
+
# Matches 'ID-12345' or 'ID 12345' and reformats to 'ID:12345'
|
|
53
|
+
r'ID[- ](\d+)': r'ID:$1'
|
|
54
54
|
}
|
|
55
55
|
|
|
56
|
-
|
|
57
|
-
|
|
56
|
+
id_cleaner = ColumnCleaner(column_name='user_id', rules=id_rules)
|
|
58
57
|
# This object would then be passed to a DataFrameCleaner.
|
|
59
58
|
```
|
|
60
59
|
"""
|
|
@@ -529,7 +528,7 @@ class KeywordDummifier:
|
|
|
529
528
|
|
|
530
529
|
categorize_expr = categorize_expr.otherwise(None).alias("category")
|
|
531
530
|
|
|
532
|
-
temp_df = pl.
|
|
531
|
+
temp_df = pl.select(categorize_expr)
|
|
533
532
|
df_with_dummies = temp_df.to_dummies(columns=["category"])
|
|
534
533
|
|
|
535
534
|
final_columns = []
|
|
@@ -15,7 +15,7 @@ import re
|
|
|
15
15
|
# Keep track of all available tools, show using `info()`
|
|
16
16
|
__all__ = [
|
|
17
17
|
"summarize_dataframe",
|
|
18
|
-
"
|
|
18
|
+
"drop_constant_columns",
|
|
19
19
|
"drop_rows_with_missing_data",
|
|
20
20
|
"split_features_targets",
|
|
21
21
|
"show_null_columns",
|
|
@@ -62,44 +62,49 @@ def summarize_dataframe(df: pd.DataFrame, round_digits: int = 2):
|
|
|
62
62
|
return summary
|
|
63
63
|
|
|
64
64
|
|
|
65
|
-
def
|
|
65
|
+
def drop_constant_columns(df: pd.DataFrame, verbose: bool = True) -> pd.DataFrame:
|
|
66
66
|
"""
|
|
67
|
-
Removes columns from a pandas DataFrame that contain only
|
|
67
|
+
Removes columns from a pandas DataFrame that contain only a single unique
|
|
68
|
+
value or are entirely null/NaN.
|
|
68
69
|
|
|
69
|
-
This utility is useful for cleaning data
|
|
70
|
+
This utility is useful for cleaning data by removing constant features that
|
|
71
|
+
have no predictive value.
|
|
70
72
|
|
|
71
73
|
Args:
|
|
72
74
|
df (pd.DataFrame):
|
|
73
75
|
The pandas DataFrame to clean.
|
|
76
|
+
verbose (bool):
|
|
77
|
+
If True, prints the names of the columns that were dropped.
|
|
78
|
+
Defaults to True.
|
|
74
79
|
|
|
75
80
|
Returns:
|
|
76
81
|
pd.DataFrame:
|
|
77
|
-
A new DataFrame with the
|
|
82
|
+
A new DataFrame with the constant columns removed.
|
|
78
83
|
"""
|
|
79
84
|
if not isinstance(df, pd.DataFrame):
|
|
80
85
|
raise TypeError("Input must be a pandas DataFrame.")
|
|
81
|
-
|
|
86
|
+
|
|
82
87
|
original_columns = set(df.columns)
|
|
83
|
-
|
|
84
88
|
cols_to_keep = []
|
|
89
|
+
|
|
85
90
|
for col_name in df.columns:
|
|
86
91
|
column = df[col_name]
|
|
87
92
|
|
|
88
|
-
#
|
|
89
|
-
if not is_numeric_dtype(column):
|
|
93
|
+
# We can apply this logic to all columns or only focus on numeric ones.
|
|
94
|
+
# if not is_numeric_dtype(column):
|
|
95
|
+
# cols_to_keep.append(col_name)
|
|
96
|
+
# continue
|
|
97
|
+
|
|
98
|
+
# Keep a column if it has more than one unique value (nunique ignores NaNs by default)
|
|
99
|
+
if column.nunique(dropna=True) > 1:
|
|
90
100
|
cols_to_keep.append(col_name)
|
|
91
|
-
continue
|
|
92
101
|
|
|
93
|
-
|
|
94
|
-
if (column != 0).any():
|
|
95
|
-
cols_to_keep.append(col_name)
|
|
96
|
-
|
|
97
|
-
dropped_columns = original_columns - set(cols_to_keep)
|
|
102
|
+
dropped_columns = original_columns - set(cols_to_keep)
|
|
98
103
|
if dropped_columns and verbose:
|
|
99
|
-
print(f"Dropped {len(dropped_columns)} columns:")
|
|
104
|
+
print(f"Dropped {len(dropped_columns)} constant columns:")
|
|
100
105
|
for dropped_column in dropped_columns:
|
|
101
106
|
print(f" {dropped_column}")
|
|
102
|
-
|
|
107
|
+
|
|
103
108
|
return df[cols_to_keep]
|
|
104
109
|
|
|
105
110
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dragon_ml_toolbox-3.5.0 → dragon_ml_toolbox-3.6.0}/dragon_ml_toolbox.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
|
File without changes
|
{dragon_ml_toolbox-3.5.0 → dragon_ml_toolbox-3.6.0}/dragon_ml_toolbox.egg-info/top_level.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dragon_ml_toolbox-3.5.0 → dragon_ml_toolbox-3.6.0}/ml_tools/_particle_swarm_optimization.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|