dragon-ml-toolbox 19.12.1__tar.gz → 19.13.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dragon_ml_toolbox-19.12.1/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-19.13.0}/PKG-INFO +1 -1
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0/dragon_ml_toolbox.egg-info}/PKG-INFO +1 -1
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_trainer.py +3 -3
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_data_exploration.py +38 -62
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_utilities.py +1 -1
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/pyproject.toml +1 -1
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/LICENSE +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/README.md +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/dragon_ml_toolbox.egg-info/SOURCES.txt +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/dragon_ml_toolbox.egg-info/dependency_links.txt +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/dragon_ml_toolbox.egg-info/requires.txt +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/dragon_ml_toolbox.egg-info/top_level.txt +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/ETL_cleaning.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/ETL_engineering.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/GUI_tools.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/IO_tools.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/MICE_imputation.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_callbacks.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_chaining_inference.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_chaining_utilities.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_configuration.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_configuration_pytab.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_datasetmaster.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_evaluation.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_evaluation_captum.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_evaluation_multi.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_finalize_handler.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_inference.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_models.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_models_advanced.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_models_pytab.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_optimization.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_optimization_pareto.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_scaler.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_sequence_datasetmaster.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_sequence_evaluation.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_sequence_inference.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_sequence_models.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_trainer.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_utilities.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_vision_datasetmaster.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_vision_evaluation.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_vision_inference.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_vision_models.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_vision_transformers.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/PSO_optimization.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/SQL.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/VIF_factor.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/__init__.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ETL_cleaning.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ETL_engineering.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_GUI_tools.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_IO_tools.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_MICE_imputation.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_callbacks.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_chaining_inference.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_chaining_utilities.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_configuration.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_configuration_pytab.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_datasetmaster.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_evaluation.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_evaluation_captum.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_evaluation_multi.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_finalize_handler.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_inference.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_models.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_models_advanced.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_models_pytab.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_optimization.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_optimization_pareto.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_scaler.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_sequence_datasetmaster.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_sequence_evaluation.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_sequence_inference.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_sequence_models.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_utilities.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_vision_datasetmaster.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_vision_evaluation.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_vision_inference.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_vision_models.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_vision_transformers.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_PSO_optimization.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_SQL.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_VIF_factor.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/__init__.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ensemble_evaluation.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ensemble_inference.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ensemble_learning.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_excel_handler.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_keys.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_logger.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_math_utilities.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_models_advanced_base.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_models_advanced_helpers.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_optimization_tools.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_path_manager.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_plot_fonts.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_schema.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_script_info.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_serde.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/constants.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/data_exploration.py +3 -3
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/ensemble_evaluation.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/ensemble_inference.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/ensemble_learning.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/excel_handler.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/keys.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/math_utilities.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/optimization_tools.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/path_manager.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/plot_fonts.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/schema.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/serde.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/utilities.py +0 -0
- {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dragon-ml-toolbox
|
|
3
|
-
Version: 19.12.1
|
|
3
|
+
Version: 19.13.0
|
|
4
4
|
Summary: Complete pipelines and helper tools for data science and machine learning projects.
|
|
5
5
|
Author-email: Karl Luigi Loza Vidaurre <luigiloza@gmail.com>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dragon-ml-toolbox
|
|
3
|
-
Version: 19.12.1
|
|
3
|
+
Version: 19.13.0
|
|
4
4
|
Summary: Complete pipelines and helper tools for data science and machine learning projects.
|
|
5
5
|
Author-email: Karl Luigi Loza Vidaurre <luigiloza@gmail.com>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -663,7 +663,7 @@ class DragonTrainer(_BaseDragonTrainer):
|
|
|
663
663
|
Evaluates the model, routing to the correct evaluation function based on task `kind`.
|
|
664
664
|
|
|
665
665
|
Args:
|
|
666
|
-
model_checkpoint (
|
|
666
|
+
model_checkpoint (Path | "best" | "current"):
|
|
667
667
|
- Path to a valid checkpoint for the model. The state of the trained model will be overwritten in place.
|
|
668
668
|
- If 'best', the best checkpoint will be loaded if a DragonModelCheckpoint was provided. The state of the trained model will be overwritten in place.
|
|
669
669
|
- If 'current', use the current state of the trained model up the latest trained epoch.
|
|
@@ -1608,7 +1608,7 @@ class DragonDetectionTrainer(_BaseDragonTrainer):
|
|
|
1608
1608
|
|
|
1609
1609
|
Args:
|
|
1610
1610
|
save_dir (str | Path): Directory to save all reports and plots.
|
|
1611
|
-
model_checkpoint (
|
|
1611
|
+
model_checkpoint (Path | "best" | "current"):
|
|
1612
1612
|
- Path to a valid checkpoint for the model. The state of the trained model will be overwritten in place.
|
|
1613
1613
|
- If 'best', the best checkpoint will be loaded if a DragonModelCheckpoint was provided. The state of the trained model will be overwritten in place.
|
|
1614
1614
|
- If 'current', use the current state of the trained model up the latest trained epoch.
|
|
@@ -2046,7 +2046,7 @@ class DragonSequenceTrainer(_BaseDragonTrainer):
|
|
|
2046
2046
|
Evaluates the model, routing to the correct evaluation function.
|
|
2047
2047
|
|
|
2048
2048
|
Args:
|
|
2049
|
-
model_checkpoint (
|
|
2049
|
+
model_checkpoint (Path | "best" | "current"):
|
|
2050
2050
|
- Path to a valid checkpoint for the model.
|
|
2051
2051
|
- If 'best', the best checkpoint will be loaded.
|
|
2052
2052
|
- If 'current', use the current state of the trained model.
|
|
@@ -17,7 +17,6 @@ from ._schema import FeatureSchema
|
|
|
17
17
|
_LOGGER = get_logger("Data Exploration")
|
|
18
18
|
|
|
19
19
|
|
|
20
|
-
# Keep track of all available tools, show using `info()`
|
|
21
20
|
__all__ = [
|
|
22
21
|
"summarize_dataframe",
|
|
23
22
|
"drop_constant_columns",
|
|
@@ -29,19 +28,19 @@ __all__ = [
|
|
|
29
28
|
"plot_value_distributions",
|
|
30
29
|
"plot_continuous_vs_target",
|
|
31
30
|
"plot_categorical_vs_target",
|
|
32
|
-
"encode_categorical_features",
|
|
33
31
|
"split_features_targets",
|
|
34
|
-
"
|
|
32
|
+
"encode_categorical_features",
|
|
35
33
|
"clip_outliers_single",
|
|
36
34
|
"clip_outliers_multi",
|
|
37
35
|
"drop_outlier_samples",
|
|
38
36
|
"plot_correlation_heatmap",
|
|
37
|
+
"finalize_feature_schema",
|
|
39
38
|
"match_and_filter_columns_by_regex",
|
|
40
39
|
"standardize_percentages",
|
|
41
40
|
"reconstruct_one_hot",
|
|
42
41
|
"reconstruct_binary",
|
|
43
42
|
"reconstruct_multibinary",
|
|
44
|
-
"
|
|
43
|
+
"split_continuous_binary",
|
|
45
44
|
"apply_feature_schema"
|
|
46
45
|
]
|
|
47
46
|
|
|
@@ -109,22 +108,17 @@ def drop_constant_columns(df: pd.DataFrame, verbose: bool = True) -> pd.DataFram
|
|
|
109
108
|
for col_name in df_clean.columns:
|
|
110
109
|
column = df_clean[col_name]
|
|
111
110
|
|
|
112
|
-
# We can apply this logic to all columns or only focus on numeric ones.
|
|
113
|
-
# if not is_numeric_dtype(column):
|
|
114
|
-
# cols_to_keep.append(col_name)
|
|
115
|
-
# continue
|
|
116
|
-
|
|
117
111
|
# Keep a column if it has more than one unique value (nunique ignores NaNs by default)
|
|
118
112
|
if column.nunique(dropna=True) > 1:
|
|
119
113
|
cols_to_keep.append(col_name)
|
|
120
114
|
|
|
121
115
|
dropped_columns = original_columns - set(cols_to_keep)
|
|
122
116
|
if verbose:
|
|
123
|
-
_LOGGER.info(f"🧹 Dropped {len(dropped_columns)} constant columns.")
|
|
124
117
|
if dropped_columns:
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
118
|
+
_LOGGER.info(f"🧹 Dropped {len(dropped_columns)} constant columns: {list(dropped_columns)}")
|
|
119
|
+
else:
|
|
120
|
+
_LOGGER.info("No constant columns found.")
|
|
121
|
+
|
|
128
122
|
# Return a new DataFrame with only the columns to keep
|
|
129
123
|
df_clean = df_clean[cols_to_keep]
|
|
130
124
|
|
|
@@ -339,8 +333,7 @@ def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7, sho
|
|
|
339
333
|
cols_to_drop = missing_fraction[missing_fraction > threshold].index # type: ignore
|
|
340
334
|
|
|
341
335
|
if len(cols_to_drop) > 0:
|
|
342
|
-
_LOGGER.info(f"🧹 Dropping columns with more than {threshold*100:.0f}% missing data:")
|
|
343
|
-
print(list(cols_to_drop))
|
|
336
|
+
_LOGGER.info(f"🧹 Dropping columns with more than {threshold*100:.0f}% missing data: {list(cols_to_drop)}")
|
|
344
337
|
|
|
345
338
|
result_df = df.drop(columns=cols_to_drop)
|
|
346
339
|
if show_nulls_after:
|
|
@@ -370,9 +363,8 @@ def drop_macro(df: pd.DataFrame,
|
|
|
370
363
|
|
|
371
364
|
Args:
|
|
372
365
|
df (pd.DataFrame): The input pandas DataFrame to be cleaned.
|
|
373
|
-
log_directory (Union[str, Path]): Path to the directory where the
|
|
374
|
-
|
|
375
|
-
will be saved.
|
|
366
|
+
log_directory (Union[str, Path]): Path to the directory where the missing data reports
|
|
367
|
+
and plots will be saved inside a "Missing Report" subdirectory.
|
|
376
368
|
targets (list[str]): A list of column names to be treated as target
|
|
377
369
|
variables. This list guides the row-dropping logic.
|
|
378
370
|
skip_targets (bool, optional): If True, the columns listed in `targets`
|
|
@@ -388,15 +380,18 @@ def drop_macro(df: pd.DataFrame,
|
|
|
388
380
|
# make a deep copy to work with
|
|
389
381
|
df_clean = df.copy()
|
|
390
382
|
|
|
383
|
+
base_dir_path = make_fullpath(log_directory, make=True, enforce="directory")
|
|
384
|
+
full_path = base_dir_path / "Missing Report"
|
|
385
|
+
|
|
391
386
|
# Log initial state + Plot
|
|
392
387
|
missing_data_start = show_null_columns(
|
|
393
388
|
df=df_clean,
|
|
394
|
-
plot_to_dir=
|
|
389
|
+
plot_to_dir=full_path,
|
|
395
390
|
plot_filename="Original",
|
|
396
391
|
use_all_columns=True
|
|
397
392
|
)
|
|
398
393
|
save_dataframe_filename(df=missing_data_start.reset_index(drop=False),
|
|
399
|
-
save_dir=
|
|
394
|
+
save_dir=full_path,
|
|
400
395
|
filename="Missing_Data_Original")
|
|
401
396
|
|
|
402
397
|
# Clean cycles for rows and columns
|
|
@@ -425,12 +420,12 @@ def drop_macro(df: pd.DataFrame,
|
|
|
425
420
|
# log final state + plot
|
|
426
421
|
missing_data_final = show_null_columns(
|
|
427
422
|
df=df_clean,
|
|
428
|
-
plot_to_dir=
|
|
423
|
+
plot_to_dir=full_path,
|
|
429
424
|
plot_filename="Processed",
|
|
430
425
|
use_all_columns=True
|
|
431
426
|
)
|
|
432
427
|
save_dataframe_filename(df=missing_data_final.reset_index(drop=False),
|
|
433
|
-
save_dir=
|
|
428
|
+
save_dir=full_path,
|
|
434
429
|
filename="Missing_Data_Processed")
|
|
435
430
|
|
|
436
431
|
# return cleaned dataframe
|
|
@@ -477,9 +472,8 @@ def plot_value_distributions(
|
|
|
477
472
|
df: pd.DataFrame,
|
|
478
473
|
save_dir: Union[str, Path],
|
|
479
474
|
categorical_columns: Optional[List[str]] = None,
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
fill_na_with: str = "Missing"
|
|
475
|
+
max_categories: int = 100,
|
|
476
|
+
fill_na_with: str = "MISSING DATA"
|
|
483
477
|
):
|
|
484
478
|
"""
|
|
485
479
|
Plots and saves the value distributions for all columns in a DataFrame,
|
|
@@ -492,15 +486,9 @@ def plot_value_distributions(
|
|
|
492
486
|
Args:
|
|
493
487
|
df (pd.DataFrame): The input DataFrame to analyze.
|
|
494
488
|
save_dir (str | Path): Directory path to save the plots.
|
|
495
|
-
categorical_columns (List[str] | None): If provided,
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
categorical_cardinality_threshold (int): A numeric column will be treated
|
|
499
|
-
as 'categorical' if its number of unique values is less than or equal to this threshold. (Ignored if `categorical_columns` is set).
|
|
500
|
-
max_categories (int): The maximum number of unique categories a
|
|
501
|
-
categorical feature can have to be plotted. Features exceeding this limit will be skipped.
|
|
502
|
-
fill_na_with (str): A string to replace NaN values in categorical columns. This allows plotting 'missingness' as its
|
|
503
|
-
own category. Defaults to "Missing".
|
|
489
|
+
categorical_columns (List[str] | None): If provided, these will be treated as categorical, and all other columns will be treated as continuous.
|
|
490
|
+
max_categories (int): The maximum number of unique categories a categorical feature can have to be plotted. Features exceeding this limit will be skipped.
|
|
491
|
+
fill_na_with (str): A string to replace NaN values in categorical columns. This allows plotting 'missingness' as its own category.
|
|
504
492
|
|
|
505
493
|
Notes:
|
|
506
494
|
- `seaborn.histplot` with KDE is used for continuous features.
|
|
@@ -535,7 +523,7 @@ def plot_value_distributions(
|
|
|
535
523
|
is_continuous = True
|
|
536
524
|
else:
|
|
537
525
|
# Use auto-detection
|
|
538
|
-
if is_numeric
|
|
526
|
+
if is_numeric:
|
|
539
527
|
is_continuous = True
|
|
540
528
|
|
|
541
529
|
# --- Case 1: Continuous Numeric (Histogram) ---
|
|
@@ -550,7 +538,7 @@ def plot_value_distributions(
|
|
|
550
538
|
save_path = numeric_dir / f"{sanitize_filename(col_name)}.svg"
|
|
551
539
|
numeric_plots_saved += 1
|
|
552
540
|
|
|
553
|
-
# --- Case 2: Categorical
|
|
541
|
+
# --- Case 2: Categorical (Count Plot) ---
|
|
554
542
|
else:
|
|
555
543
|
# Check max categories
|
|
556
544
|
if n_unique > max_categories:
|
|
@@ -559,7 +547,7 @@ def plot_value_distributions(
|
|
|
559
547
|
|
|
560
548
|
# Adaptive figure size
|
|
561
549
|
fig_width = max(10, n_unique * 0.5)
|
|
562
|
-
plt.figure(figsize=(fig_width,
|
|
550
|
+
plt.figure(figsize=(fig_width, 8))
|
|
563
551
|
|
|
564
552
|
# Make a temporary copy for plotting to handle NaNs
|
|
565
553
|
temp_series = df[col_name].copy()
|
|
@@ -574,7 +562,7 @@ def plot_value_distributions(
|
|
|
574
562
|
|
|
575
563
|
# Get category order by frequency
|
|
576
564
|
order = temp_series.value_counts().index
|
|
577
|
-
sns.countplot(x=temp_series, order=order, palette="
|
|
565
|
+
sns.countplot(x=temp_series, order=order, palette="Oranges", hue=temp_series, legend=False)
|
|
578
566
|
|
|
579
567
|
plt.title(f"Distribution of '{col_name}' (Categorical)")
|
|
580
568
|
plt.xlabel(col_name)
|
|
@@ -744,23 +732,23 @@ def plot_categorical_vs_target(
|
|
|
744
732
|
targets: List[str],
|
|
745
733
|
save_dir: Union[str, Path],
|
|
746
734
|
features: Optional[List[str]] = None,
|
|
747
|
-
|
|
748
|
-
|
|
749
|
-
fill_na_with: str = "Missing"
|
|
735
|
+
max_categories: int = 50,
|
|
736
|
+
fill_na_with: str = "MISSING DATA"
|
|
750
737
|
):
|
|
751
738
|
"""
|
|
752
|
-
Plots each categorical feature against each numeric target using box
|
|
739
|
+
Plots each categorical feature against each numeric target using box plots.
|
|
753
740
|
|
|
754
741
|
This function is a core EDA step for regression tasks to understand the
|
|
755
742
|
relationship between a categorical independent variable and a continuous
|
|
756
743
|
dependent variable.
|
|
744
|
+
|
|
745
|
+
Plots are saved as individual .svg files in a structured way, with a subdirectory created for each target.
|
|
757
746
|
|
|
758
747
|
Args:
|
|
759
748
|
df (pd.DataFrame): The input DataFrame.
|
|
760
749
|
targets (List[str]): A list of numeric target column names (y-axis).
|
|
761
750
|
save_dir (str | Path): The base directory where plots will be saved. A subdirectory will be created here for each target.
|
|
762
751
|
features (List[str] | None): A list of categorical feature column names (x-axis). If None, all non-numeric (object) columns will be used.
|
|
763
|
-
plot_type (Literal["box", "violin"]): The type of plot to generate.
|
|
764
752
|
max_categories (int): The maximum number of unique categories a feature can have to be plotted. Features exceeding this limit will be skipped.
|
|
765
753
|
fill_na_with (str): A string to replace NaN values in categorical columns. This allows plotting 'missingness' as its own category. Defaults to "Missing".
|
|
766
754
|
|
|
@@ -770,10 +758,6 @@ def plot_categorical_vs_target(
|
|
|
770
758
|
"""
|
|
771
759
|
# 1. Validate the base save directory and inputs
|
|
772
760
|
base_save_path = make_fullpath(save_dir, make=True, enforce="directory")
|
|
773
|
-
|
|
774
|
-
if plot_type not in ["box", "violin"]:
|
|
775
|
-
_LOGGER.error(f"Invalid plot type '{plot_type}'")
|
|
776
|
-
raise ValueError()
|
|
777
761
|
|
|
778
762
|
# 2. Validate target columns (must be numeric)
|
|
779
763
|
valid_targets = []
|
|
@@ -796,14 +780,10 @@ def plot_categorical_vs_target(
|
|
|
796
780
|
for col in df.columns:
|
|
797
781
|
if col in valid_targets:
|
|
798
782
|
continue
|
|
799
|
-
|
|
800
783
|
# Auto-include object dtypes
|
|
801
784
|
if is_object_dtype(df[col]):
|
|
802
785
|
features_to_plot.append(col)
|
|
803
|
-
|
|
804
|
-
# elif is_numeric_dtype(df[col]) and df[col].nunique() <= max_categories:
|
|
805
|
-
# _LOGGER.info(f"Treating low-cardinality numeric column '{col}' as categorical.")
|
|
806
|
-
# features_to_plot.append(col)
|
|
786
|
+
|
|
807
787
|
else:
|
|
808
788
|
# Validate user-provided list
|
|
809
789
|
for col in features:
|
|
@@ -821,12 +801,11 @@ def plot_categorical_vs_target(
|
|
|
821
801
|
|
|
822
802
|
for target_name in valid_targets:
|
|
823
803
|
# Create a sanitized subdirectory for this target
|
|
824
|
-
safe_target_dir_name = sanitize_filename(f"{target_name}
|
|
804
|
+
safe_target_dir_name = sanitize_filename(f"{target_name}_vs_Categorical")
|
|
825
805
|
target_save_dir = base_save_path / safe_target_dir_name
|
|
826
806
|
target_save_dir.mkdir(parents=True, exist_ok=True)
|
|
827
807
|
|
|
828
|
-
_LOGGER.info(f"Generating
|
|
829
|
-
|
|
808
|
+
_LOGGER.info(f"Generating plots for target: '{target_name}' -> Saving to '{target_save_dir.name}'")
|
|
830
809
|
for feature_name in features_to_plot:
|
|
831
810
|
|
|
832
811
|
# Make a temporary copy for plotting to handle NaNs and dtypes
|
|
@@ -848,12 +827,9 @@ def plot_categorical_vs_target(
|
|
|
848
827
|
|
|
849
828
|
# 5. Create the plot
|
|
850
829
|
# Increase figure width for categories
|
|
851
|
-
plt.figure(figsize=(max(10, n_unique * 1.2),
|
|
830
|
+
plt.figure(figsize=(max(10, n_unique * 1.2), 10))
|
|
852
831
|
|
|
853
|
-
|
|
854
|
-
sns.boxplot(x=feature_name, y=target_name, data=temp_df)
|
|
855
|
-
elif plot_type == "violin":
|
|
856
|
-
sns.violinplot(x=feature_name, y=target_name, data=temp_df)
|
|
832
|
+
sns.boxplot(x=feature_name, y=target_name, data=temp_df)
|
|
857
833
|
|
|
858
834
|
plt.title(f'{target_name} vs {feature_name}')
|
|
859
835
|
plt.xlabel(feature_name)
|
|
@@ -981,7 +957,7 @@ def encode_categorical_features(
|
|
|
981
957
|
|
|
982
958
|
# Handle the dataset splitting logic
|
|
983
959
|
if split_resulting_dataset:
|
|
984
|
-
df_categorical = df_encoded[valid_columns]
|
|
960
|
+
df_categorical = df_encoded[valid_columns]
|
|
985
961
|
df_non_categorical = df.drop(columns=valid_columns)
|
|
986
962
|
return mappings, df_non_categorical, df_categorical
|
|
987
963
|
else:
|
|
@@ -1167,7 +1143,7 @@ def clip_outliers_single(
|
|
|
1167
1143
|
|
|
1168
1144
|
def clip_outliers_multi(
|
|
1169
1145
|
df: pd.DataFrame,
|
|
1170
|
-
clip_dict: Dict[str, Tuple[
|
|
1146
|
+
clip_dict: Union[Dict[str, Tuple[int, int]], Dict[str, Tuple[float, float]]],
|
|
1171
1147
|
verbose: bool=False
|
|
1172
1148
|
) -> pd.DataFrame:
|
|
1173
1149
|
"""
|
|
@@ -396,7 +396,7 @@ def save_dataframe(df: Union[pd.DataFrame, pl.DataFrame], full_path: Path):
|
|
|
396
396
|
if not isinstance(full_path, Path) or not full_path.suffix.endswith(".csv"):
|
|
397
397
|
_LOGGER.error('A path object pointing to a .csv file must be provided.')
|
|
398
398
|
raise ValueError()
|
|
399
|
-
|
|
399
|
+
|
|
400
400
|
save_dataframe_filename(df=df,
|
|
401
401
|
save_dir=full_path.parent,
|
|
402
402
|
filename=full_path.name)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "dragon-ml-toolbox"
|
|
3
|
-
version = "19.12.1"
|
|
3
|
+
version = "19.13.0"
|
|
4
4
|
description = "Complete pipelines and helper tools for data science and machine learning projects."
|
|
5
5
|
authors = [
|
|
6
6
|
{ name = "Karl Luigi Loza Vidaurre", email = "luigiloza@gmail.com" }
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/dragon_ml_toolbox.egg-info/SOURCES.txt
RENAMED
|
File without changes
|
|
File without changes
|
{dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/dragon_ml_toolbox.egg-info/requires.txt
RENAMED
|
File without changes
|
{dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/dragon_ml_toolbox.egg-info/top_level.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_sequence_datasetmaster.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_chaining_inference.py
RENAMED
|
File without changes
|
{dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_chaining_utilities.py
RENAMED
|
File without changes
|
|
File without changes
|
{dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_configuration_pytab.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_evaluation_captum.py
RENAMED
|
File without changes
|
{dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_evaluation_multi.py
RENAMED
|
File without changes
|
{dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_finalize_handler.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_models_advanced.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_optimization_pareto.py
RENAMED
|
File without changes
|
|
File without changes
|
{dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_sequence_datasetmaster.py
RENAMED
|
File without changes
|
{dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_sequence_evaluation.py
RENAMED
|
File without changes
|
{dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_sequence_inference.py
RENAMED
|
File without changes
|
{dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_sequence_models.py
RENAMED
|
File without changes
|
|
File without changes
|
{dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_vision_datasetmaster.py
RENAMED
|
File without changes
|
{dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_vision_evaluation.py
RENAMED
|
File without changes
|
{dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_vision_inference.py
RENAMED
|
File without changes
|
|
File without changes
|
{dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_vision_transformers.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ensemble_evaluation.py
RENAMED
|
File without changes
|
{dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ensemble_inference.py
RENAMED
|
File without changes
|
{dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ensemble_learning.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_models_advanced_base.py
RENAMED
|
File without changes
|
{dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_models_advanced_helpers.py
RENAMED
|
File without changes
|
{dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_optimization_tools.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
@@ -37,18 +37,18 @@ __all__ = [
|
|
|
37
37
|
"plot_value_distributions",
|
|
38
38
|
"plot_continuous_vs_target",
|
|
39
39
|
"plot_categorical_vs_target",
|
|
40
|
-
"encode_categorical_features",
|
|
41
40
|
"split_features_targets",
|
|
42
|
-
"
|
|
41
|
+
"encode_categorical_features",
|
|
43
42
|
"clip_outliers_single",
|
|
44
43
|
"clip_outliers_multi",
|
|
45
44
|
"drop_outlier_samples",
|
|
46
45
|
"plot_correlation_heatmap",
|
|
46
|
+
"finalize_feature_schema",
|
|
47
47
|
"match_and_filter_columns_by_regex",
|
|
48
48
|
"standardize_percentages",
|
|
49
49
|
"reconstruct_one_hot",
|
|
50
50
|
"reconstruct_binary",
|
|
51
51
|
"reconstruct_multibinary",
|
|
52
|
-
"
|
|
52
|
+
"split_continuous_binary",
|
|
53
53
|
"apply_feature_schema",
|
|
54
54
|
]
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|