dragon-ml-toolbox 19.14.0__py3-none-any.whl → 20.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dragon_ml_toolbox-19.14.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/METADATA +29 -46
- dragon_ml_toolbox-20.0.0.dist-info/RECORD +178 -0
- ml_tools/{ETL_cleaning.py → ETL_cleaning/__init__.py} +13 -5
- ml_tools/ETL_cleaning/_basic_clean.py +351 -0
- ml_tools/ETL_cleaning/_clean_tools.py +128 -0
- ml_tools/ETL_cleaning/_dragon_cleaner.py +245 -0
- ml_tools/ETL_cleaning/_imprimir.py +13 -0
- ml_tools/{ETL_engineering.py → ETL_engineering/__init__.py} +8 -4
- ml_tools/ETL_engineering/_dragon_engineering.py +261 -0
- ml_tools/ETL_engineering/_imprimir.py +24 -0
- ml_tools/{_core/_ETL_engineering.py → ETL_engineering/_transforms.py} +14 -267
- ml_tools/{_core → GUI_tools}/_GUI_tools.py +37 -40
- ml_tools/{GUI_tools.py → GUI_tools/__init__.py} +7 -5
- ml_tools/GUI_tools/_imprimir.py +12 -0
- ml_tools/IO_tools/_IO_loggers.py +235 -0
- ml_tools/IO_tools/_IO_save_load.py +151 -0
- ml_tools/IO_tools/_IO_utils.py +140 -0
- ml_tools/{IO_tools.py → IO_tools/__init__.py} +13 -5
- ml_tools/IO_tools/_imprimir.py +14 -0
- ml_tools/MICE/_MICE_imputation.py +132 -0
- ml_tools/{MICE_imputation.py → MICE/__init__.py} +6 -7
- ml_tools/{_core/_MICE_imputation.py → MICE/_dragon_mice.py} +243 -322
- ml_tools/MICE/_imprimir.py +11 -0
- ml_tools/{ML_callbacks.py → ML_callbacks/__init__.py} +12 -4
- ml_tools/ML_callbacks/_base.py +101 -0
- ml_tools/ML_callbacks/_checkpoint.py +232 -0
- ml_tools/ML_callbacks/_early_stop.py +208 -0
- ml_tools/ML_callbacks/_imprimir.py +12 -0
- ml_tools/ML_callbacks/_scheduler.py +197 -0
- ml_tools/{ML_chaining_utilities.py → ML_chain/__init__.py} +8 -3
- ml_tools/{_core/_ML_chaining_utilities.py → ML_chain/_chaining_tools.py} +5 -129
- ml_tools/ML_chain/_dragon_chain.py +140 -0
- ml_tools/ML_chain/_imprimir.py +11 -0
- ml_tools/ML_configuration/__init__.py +90 -0
- ml_tools/ML_configuration/_base_model_config.py +69 -0
- ml_tools/ML_configuration/_finalize.py +366 -0
- ml_tools/ML_configuration/_imprimir.py +47 -0
- ml_tools/ML_configuration/_metrics.py +593 -0
- ml_tools/ML_configuration/_models.py +206 -0
- ml_tools/ML_configuration/_training.py +124 -0
- ml_tools/ML_datasetmaster/__init__.py +28 -0
- ml_tools/ML_datasetmaster/_base_datasetmaster.py +337 -0
- ml_tools/{_core/_ML_datasetmaster.py → ML_datasetmaster/_datasetmaster.py} +9 -329
- ml_tools/ML_datasetmaster/_imprimir.py +15 -0
- ml_tools/{_core/_ML_sequence_datasetmaster.py → ML_datasetmaster/_sequence_datasetmaster.py} +13 -15
- ml_tools/{_core/_ML_vision_datasetmaster.py → ML_datasetmaster/_vision_datasetmaster.py} +63 -65
- ml_tools/ML_evaluation/__init__.py +53 -0
- ml_tools/ML_evaluation/_classification.py +629 -0
- ml_tools/ML_evaluation/_feature_importance.py +409 -0
- ml_tools/ML_evaluation/_imprimir.py +25 -0
- ml_tools/ML_evaluation/_loss.py +92 -0
- ml_tools/ML_evaluation/_regression.py +273 -0
- ml_tools/{_core/_ML_sequence_evaluation.py → ML_evaluation/_sequence.py} +8 -11
- ml_tools/{_core/_ML_vision_evaluation.py → ML_evaluation/_vision.py} +12 -17
- ml_tools/{_core → ML_evaluation_captum}/_ML_evaluation_captum.py +11 -38
- ml_tools/{ML_evaluation_captum.py → ML_evaluation_captum/__init__.py} +6 -4
- ml_tools/ML_evaluation_captum/_imprimir.py +10 -0
- ml_tools/{_core → ML_finalize_handler}/_ML_finalize_handler.py +3 -7
- ml_tools/ML_finalize_handler/__init__.py +10 -0
- ml_tools/ML_finalize_handler/_imprimir.py +8 -0
- ml_tools/ML_inference/__init__.py +22 -0
- ml_tools/ML_inference/_base_inference.py +166 -0
- ml_tools/{_core/_ML_chaining_inference.py → ML_inference/_chain_inference.py} +14 -17
- ml_tools/ML_inference/_dragon_inference.py +332 -0
- ml_tools/ML_inference/_imprimir.py +11 -0
- ml_tools/ML_inference/_multi_inference.py +180 -0
- ml_tools/ML_inference_sequence/__init__.py +10 -0
- ml_tools/ML_inference_sequence/_imprimir.py +8 -0
- ml_tools/{_core/_ML_sequence_inference.py → ML_inference_sequence/_sequence_inference.py} +11 -15
- ml_tools/ML_inference_vision/__init__.py +10 -0
- ml_tools/ML_inference_vision/_imprimir.py +8 -0
- ml_tools/{_core/_ML_vision_inference.py → ML_inference_vision/_vision_inference.py} +15 -19
- ml_tools/ML_models/__init__.py +32 -0
- ml_tools/{_core/_ML_models_advanced.py → ML_models/_advanced_models.py} +22 -18
- ml_tools/ML_models/_base_mlp_attention.py +198 -0
- ml_tools/{_core/_models_advanced_base.py → ML_models/_base_save_load.py} +73 -49
- ml_tools/ML_models/_dragon_tabular.py +248 -0
- ml_tools/ML_models/_imprimir.py +18 -0
- ml_tools/ML_models/_mlp_attention.py +134 -0
- ml_tools/{_core → ML_models}/_models_advanced_helpers.py +13 -13
- ml_tools/ML_models_sequence/__init__.py +10 -0
- ml_tools/ML_models_sequence/_imprimir.py +8 -0
- ml_tools/{_core/_ML_sequence_models.py → ML_models_sequence/_sequence_models.py} +5 -8
- ml_tools/ML_models_vision/__init__.py +29 -0
- ml_tools/ML_models_vision/_base_wrapper.py +254 -0
- ml_tools/ML_models_vision/_image_classification.py +182 -0
- ml_tools/ML_models_vision/_image_segmentation.py +108 -0
- ml_tools/ML_models_vision/_imprimir.py +16 -0
- ml_tools/ML_models_vision/_object_detection.py +135 -0
- ml_tools/ML_optimization/__init__.py +21 -0
- ml_tools/ML_optimization/_imprimir.py +13 -0
- ml_tools/{_core/_ML_optimization_pareto.py → ML_optimization/_multi_dragon.py} +18 -24
- ml_tools/ML_optimization/_single_dragon.py +203 -0
- ml_tools/{_core/_ML_optimization.py → ML_optimization/_single_manual.py} +75 -213
- ml_tools/{_core → ML_scaler}/_ML_scaler.py +8 -11
- ml_tools/ML_scaler/__init__.py +10 -0
- ml_tools/ML_scaler/_imprimir.py +8 -0
- ml_tools/ML_trainer/__init__.py +20 -0
- ml_tools/ML_trainer/_base_trainer.py +297 -0
- ml_tools/ML_trainer/_dragon_detection_trainer.py +402 -0
- ml_tools/ML_trainer/_dragon_sequence_trainer.py +540 -0
- ml_tools/ML_trainer/_dragon_trainer.py +1160 -0
- ml_tools/ML_trainer/_imprimir.py +10 -0
- ml_tools/{ML_utilities.py → ML_utilities/__init__.py} +14 -6
- ml_tools/ML_utilities/_artifact_finder.py +382 -0
- ml_tools/ML_utilities/_imprimir.py +16 -0
- ml_tools/ML_utilities/_inspection.py +325 -0
- ml_tools/ML_utilities/_train_tools.py +205 -0
- ml_tools/{ML_vision_transformers.py → ML_vision_transformers/__init__.py} +9 -6
- ml_tools/{_core/_ML_vision_transformers.py → ML_vision_transformers/_core_transforms.py} +11 -155
- ml_tools/ML_vision_transformers/_imprimir.py +14 -0
- ml_tools/ML_vision_transformers/_offline_augmentation.py +159 -0
- ml_tools/{_core/_PSO_optimization.py → PSO_optimization/_PSO.py} +58 -15
- ml_tools/{PSO_optimization.py → PSO_optimization/__init__.py} +5 -3
- ml_tools/PSO_optimization/_imprimir.py +10 -0
- ml_tools/SQL/__init__.py +7 -0
- ml_tools/{_core/_SQL.py → SQL/_dragon_SQL.py} +7 -11
- ml_tools/SQL/_imprimir.py +8 -0
- ml_tools/{_core → VIF}/_VIF_factor.py +5 -8
- ml_tools/{VIF_factor.py → VIF/__init__.py} +4 -2
- ml_tools/VIF/_imprimir.py +10 -0
- ml_tools/_core/__init__.py +7 -1
- ml_tools/_core/_logger.py +8 -18
- ml_tools/_core/_schema_load_ops.py +43 -0
- ml_tools/_core/_script_info.py +2 -2
- ml_tools/{data_exploration.py → data_exploration/__init__.py} +32 -16
- ml_tools/data_exploration/_analysis.py +214 -0
- ml_tools/data_exploration/_cleaning.py +566 -0
- ml_tools/data_exploration/_features.py +583 -0
- ml_tools/data_exploration/_imprimir.py +32 -0
- ml_tools/data_exploration/_plotting.py +487 -0
- ml_tools/data_exploration/_schema_ops.py +176 -0
- ml_tools/{ensemble_evaluation.py → ensemble_evaluation/__init__.py} +6 -4
- ml_tools/{_core → ensemble_evaluation}/_ensemble_evaluation.py +3 -7
- ml_tools/ensemble_evaluation/_imprimir.py +14 -0
- ml_tools/{ensemble_inference.py → ensemble_inference/__init__.py} +5 -3
- ml_tools/{_core → ensemble_inference}/_ensemble_inference.py +15 -18
- ml_tools/ensemble_inference/_imprimir.py +9 -0
- ml_tools/{ensemble_learning.py → ensemble_learning/__init__.py} +4 -6
- ml_tools/{_core → ensemble_learning}/_ensemble_learning.py +7 -10
- ml_tools/ensemble_learning/_imprimir.py +10 -0
- ml_tools/{excel_handler.py → excel_handler/__init__.py} +5 -3
- ml_tools/{_core → excel_handler}/_excel_handler.py +6 -10
- ml_tools/excel_handler/_imprimir.py +13 -0
- ml_tools/{keys.py → keys/__init__.py} +4 -1
- ml_tools/keys/_imprimir.py +11 -0
- ml_tools/{_core → keys}/_keys.py +2 -0
- ml_tools/{math_utilities.py → math_utilities/__init__.py} +5 -2
- ml_tools/math_utilities/_imprimir.py +11 -0
- ml_tools/{_core → math_utilities}/_math_utilities.py +1 -5
- ml_tools/{optimization_tools.py → optimization_tools/__init__.py} +9 -4
- ml_tools/optimization_tools/_imprimir.py +13 -0
- ml_tools/optimization_tools/_optimization_bounds.py +236 -0
- ml_tools/optimization_tools/_optimization_plots.py +218 -0
- ml_tools/{path_manager.py → path_manager/__init__.py} +6 -3
- ml_tools/{_core/_path_manager.py → path_manager/_dragonmanager.py} +11 -347
- ml_tools/path_manager/_imprimir.py +15 -0
- ml_tools/path_manager/_path_tools.py +346 -0
- ml_tools/plot_fonts/__init__.py +8 -0
- ml_tools/plot_fonts/_imprimir.py +8 -0
- ml_tools/{_core → plot_fonts}/_plot_fonts.py +2 -5
- ml_tools/schema/__init__.py +15 -0
- ml_tools/schema/_feature_schema.py +223 -0
- ml_tools/schema/_gui_schema.py +191 -0
- ml_tools/schema/_imprimir.py +10 -0
- ml_tools/{serde.py → serde/__init__.py} +4 -2
- ml_tools/serde/_imprimir.py +10 -0
- ml_tools/{_core → serde}/_serde.py +3 -8
- ml_tools/{utilities.py → utilities/__init__.py} +11 -6
- ml_tools/utilities/_imprimir.py +18 -0
- ml_tools/{_core/_utilities.py → utilities/_utility_save_load.py} +13 -190
- ml_tools/utilities/_utility_tools.py +192 -0
- dragon_ml_toolbox-19.14.0.dist-info/RECORD +0 -111
- ml_tools/ML_chaining_inference.py +0 -8
- ml_tools/ML_configuration.py +0 -86
- ml_tools/ML_configuration_pytab.py +0 -14
- ml_tools/ML_datasetmaster.py +0 -10
- ml_tools/ML_evaluation.py +0 -16
- ml_tools/ML_evaluation_multi.py +0 -12
- ml_tools/ML_finalize_handler.py +0 -8
- ml_tools/ML_inference.py +0 -12
- ml_tools/ML_models.py +0 -14
- ml_tools/ML_models_advanced.py +0 -14
- ml_tools/ML_models_pytab.py +0 -14
- ml_tools/ML_optimization.py +0 -14
- ml_tools/ML_optimization_pareto.py +0 -8
- ml_tools/ML_scaler.py +0 -8
- ml_tools/ML_sequence_datasetmaster.py +0 -8
- ml_tools/ML_sequence_evaluation.py +0 -10
- ml_tools/ML_sequence_inference.py +0 -8
- ml_tools/ML_sequence_models.py +0 -8
- ml_tools/ML_trainer.py +0 -12
- ml_tools/ML_vision_datasetmaster.py +0 -12
- ml_tools/ML_vision_evaluation.py +0 -10
- ml_tools/ML_vision_inference.py +0 -8
- ml_tools/ML_vision_models.py +0 -18
- ml_tools/SQL.py +0 -8
- ml_tools/_core/_ETL_cleaning.py +0 -694
- ml_tools/_core/_IO_tools.py +0 -498
- ml_tools/_core/_ML_callbacks.py +0 -702
- ml_tools/_core/_ML_configuration.py +0 -1332
- ml_tools/_core/_ML_configuration_pytab.py +0 -102
- ml_tools/_core/_ML_evaluation.py +0 -867
- ml_tools/_core/_ML_evaluation_multi.py +0 -544
- ml_tools/_core/_ML_inference.py +0 -646
- ml_tools/_core/_ML_models.py +0 -668
- ml_tools/_core/_ML_models_pytab.py +0 -693
- ml_tools/_core/_ML_trainer.py +0 -2323
- ml_tools/_core/_ML_utilities.py +0 -886
- ml_tools/_core/_ML_vision_models.py +0 -644
- ml_tools/_core/_data_exploration.py +0 -1909
- ml_tools/_core/_optimization_tools.py +0 -493
- ml_tools/_core/_schema.py +0 -359
- ml_tools/plot_fonts.py +0 -8
- ml_tools/schema.py +0 -12
- {dragon_ml_toolbox-19.14.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-19.14.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-19.14.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-19.14.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/top_level.txt +0 -0
ml_tools/_core/_logger.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
import sys
|
|
3
|
-
from typing import Optional, Union,
|
|
3
|
+
from typing import Optional, Union, Any
|
|
4
4
|
|
|
5
5
|
# Step 1: Conditionally import colorlog
|
|
6
6
|
try:
|
|
@@ -27,7 +27,7 @@ class _UnifiedFormatter(logging.Formatter):
|
|
|
27
27
|
A unified log formatter that adds emojis, uses level-specific formats,
|
|
28
28
|
and applies colors if colorlog is available.
|
|
29
29
|
"""
|
|
30
|
-
def __init__(self, datefmt: Optional[str] = None, log_colors: Optional[
|
|
30
|
+
def __init__(self, datefmt: Optional[str] = None, log_colors: Optional[dict[str, str]] = None):
|
|
31
31
|
"""Initializes the formatter, creating sub-formatters for each level."""
|
|
32
32
|
# Initialize the base logging.Formatter correctly
|
|
33
33
|
super().__init__(datefmt=datefmt)
|
|
@@ -60,7 +60,7 @@ class _ContextAdapter(logging.LoggerAdapter):
|
|
|
60
60
|
"""
|
|
61
61
|
Wraps the logger to automatically prepend the context name to the message.
|
|
62
62
|
"""
|
|
63
|
-
def process(self, msg: Any, kwargs:
|
|
63
|
+
def process(self, msg: Any, kwargs: dict[str, Any]) -> tuple[Any, dict[str, Any]]:
|
|
64
64
|
# Retrieve the context name from the extra dict passed during init
|
|
65
65
|
context = self.extra.get('context_name', 'Unknown') # type: ignore
|
|
66
66
|
return f"[{context}] {msg}", kwargs
|
|
@@ -75,7 +75,7 @@ def _setup_main_logger(name: str = "ml_tools", level: int = logging.INFO) -> log
|
|
|
75
75
|
|
|
76
76
|
# Prevents adding handlers multiple times if imported multiple times
|
|
77
77
|
if not logger.handlers:
|
|
78
|
-
formatter_kwargs:
|
|
78
|
+
formatter_kwargs: dict[str, Any] = {
|
|
79
79
|
'datefmt': '%Y-%m-%d %H:%M'
|
|
80
80
|
}
|
|
81
81
|
|
|
@@ -121,26 +121,16 @@ def get_logger(name: Optional[str] = None) -> Union[logging.Logger, logging.Logg
|
|
|
121
121
|
return _ROOT_LOGGER
|
|
122
122
|
|
|
123
123
|
|
|
124
|
-
# Maintain backward compatibility for scripts importing _LOGGER directly
|
|
125
|
-
_LOGGER = _ROOT_LOGGER
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
def _log_and_exit(message: str, exit_code: int = 1):
|
|
129
|
-
"""Logs a critical message inside an exception block and terminates the program."""
|
|
130
|
-
_LOGGER.exception(message)
|
|
131
|
-
sys.exit(exit_code)
|
|
132
|
-
|
|
133
|
-
|
|
134
124
|
if __name__ == "__main__":
|
|
135
|
-
|
|
136
|
-
|
|
125
|
+
_ROOT_LOGGER.info("Data loading process started.")
|
|
126
|
+
_ROOT_LOGGER.warning("A non-critical configuration value is missing.")
|
|
137
127
|
|
|
138
128
|
try:
|
|
139
129
|
x = 1 / 0
|
|
140
130
|
except ZeroDivisionError:
|
|
141
|
-
|
|
131
|
+
_ROOT_LOGGER.exception("Critical error during calculation.")
|
|
142
132
|
|
|
143
|
-
|
|
133
|
+
_ROOT_LOGGER.critical("Total failure.")
|
|
144
134
|
|
|
145
135
|
test_logger = get_logger("SUPER CONTEXT")
|
|
146
136
|
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
from typing import Any, Optional
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
__all__ = ["prepare_schema_from_json"]
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def prepare_schema_from_json(data: dict[str, Any]) -> dict[str, Any]:
    """
    Processes a raw dictionary (loaded from JSON) into the clean arguments
    required to instantiate a FeatureSchema.

    Performs the following restorations:
    1. Converts list fields back to tuples.
    2. Converts string keys in 'categorical_index_map' back to integers.

    A name field that is missing, or present but null/empty in the JSON,
    is restored as an empty tuple.

    Args:
        data (dict): The raw dictionary from a JSON file (e.g. from 'schema_dict').

    Returns:
        dict: A dictionary of kwargs ready to be unpacked into FeatureSchema(**kwargs).

    Raises:
        ValueError: If a key of 'categorical_index_map' cannot be parsed as an integer.
    """
    def _as_tuple(key: str) -> tuple[Any, ...]:
        # `or ()` covers both a missing key and an explicit JSON null,
        # which `data.get(key, [])` alone would pass to tuple() as None.
        return tuple(data.get(key) or ())

    # 1. Restore Tuples (JSON loads them as lists)
    feature_names = _as_tuple("feature_names")
    cont_names = _as_tuple("continuous_feature_names")
    cat_names = _as_tuple("categorical_feature_names")

    # 2. Restore Integer Keys for categorical_index_map
    raw_map = data.get("categorical_index_map")
    cat_index_map: Optional[dict[int, int]] = None
    if raw_map is not None:
        # JSON keys are always strings; convert back to int
        cat_index_map = {int(k): v for k, v in raw_map.items()}

    # 3. Mappings (keys are strings, no conversion needed)
    cat_mappings = data.get("categorical_mappings")

    return {
        "feature_names": feature_names,
        "continuous_feature_names": cont_names,
        "categorical_feature_names": cat_names,
        "categorical_index_map": cat_index_map,
        "categorical_mappings": cat_mappings,
    }
|
ml_tools/_core/_script_info.py
CHANGED
|
@@ -1,54 +1,70 @@
|
|
|
1
|
-
from .
|
|
1
|
+
from ._analysis import (
|
|
2
2
|
summarize_dataframe,
|
|
3
|
+
show_null_columns,
|
|
4
|
+
match_and_filter_columns_by_regex,
|
|
5
|
+
)
|
|
6
|
+
|
|
7
|
+
from ._cleaning import (
|
|
3
8
|
drop_constant_columns,
|
|
4
9
|
drop_rows_with_missing_data,
|
|
5
|
-
show_null_columns,
|
|
6
10
|
drop_columns_with_missing_data,
|
|
7
11
|
drop_macro,
|
|
8
12
|
clean_column_names,
|
|
13
|
+
clip_outliers_single,
|
|
14
|
+
clip_outliers_multi,
|
|
15
|
+
drop_outlier_samples,
|
|
16
|
+
standardize_percentages,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
from ._plotting import (
|
|
9
20
|
plot_value_distributions,
|
|
10
21
|
plot_continuous_vs_target,
|
|
11
22
|
plot_categorical_vs_target,
|
|
12
|
-
|
|
23
|
+
plot_correlation_heatmap,
|
|
24
|
+
)
|
|
25
|
+
|
|
26
|
+
from ._features import (
|
|
13
27
|
split_features_targets,
|
|
14
28
|
split_continuous_binary,
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
drop_outlier_samples,
|
|
18
|
-
plot_correlation_heatmap,
|
|
19
|
-
match_and_filter_columns_by_regex,
|
|
20
|
-
standardize_percentages,
|
|
29
|
+
split_continuous_categorical_targets,
|
|
30
|
+
encode_categorical_features,
|
|
21
31
|
reconstruct_one_hot,
|
|
22
32
|
reconstruct_binary,
|
|
23
33
|
reconstruct_multibinary,
|
|
34
|
+
)
|
|
35
|
+
|
|
36
|
+
from ._schema_ops import (
|
|
24
37
|
finalize_feature_schema,
|
|
25
38
|
apply_feature_schema,
|
|
26
|
-
info
|
|
27
39
|
)
|
|
28
40
|
|
|
41
|
+
from ._imprimir import info
|
|
42
|
+
|
|
43
|
+
|
|
29
44
|
__all__ = [
|
|
30
45
|
"summarize_dataframe",
|
|
46
|
+
"show_null_columns",
|
|
31
47
|
"drop_constant_columns",
|
|
32
48
|
"drop_rows_with_missing_data",
|
|
33
|
-
"show_null_columns",
|
|
34
49
|
"drop_columns_with_missing_data",
|
|
35
50
|
"drop_macro",
|
|
36
51
|
"clean_column_names",
|
|
37
|
-
"plot_value_distributions",
|
|
38
|
-
"plot_continuous_vs_target",
|
|
39
|
-
"plot_categorical_vs_target",
|
|
52
|
+
"plot_value_distributions",
|
|
40
53
|
"split_features_targets",
|
|
54
|
+
"split_continuous_binary",
|
|
55
|
+
"split_continuous_categorical_targets",
|
|
41
56
|
"encode_categorical_features",
|
|
42
57
|
"clip_outliers_single",
|
|
43
58
|
"clip_outliers_multi",
|
|
44
59
|
"drop_outlier_samples",
|
|
60
|
+
"plot_continuous_vs_target",
|
|
61
|
+
"plot_categorical_vs_target",
|
|
45
62
|
"plot_correlation_heatmap",
|
|
46
63
|
"finalize_feature_schema",
|
|
64
|
+
"apply_feature_schema",
|
|
47
65
|
"match_and_filter_columns_by_regex",
|
|
48
66
|
"standardize_percentages",
|
|
49
67
|
"reconstruct_one_hot",
|
|
50
68
|
"reconstruct_binary",
|
|
51
69
|
"reconstruct_multibinary",
|
|
52
|
-
"split_continuous_binary",
|
|
53
|
-
"apply_feature_schema",
|
|
54
70
|
]
|
|
@@ -0,0 +1,214 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
from typing import Optional, Union
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
import numpy as np
|
|
5
|
+
import re
|
|
6
|
+
import matplotlib.pyplot as plt
|
|
7
|
+
|
|
8
|
+
from ..path_manager import make_fullpath, sanitize_filename
|
|
9
|
+
from .._core import get_logger
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
_LOGGER = get_logger("Data Exploration: Analysis")
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
__all__ = [
|
|
16
|
+
"summarize_dataframe",
|
|
17
|
+
"show_null_columns",
|
|
18
|
+
"match_and_filter_columns_by_regex",
|
|
19
|
+
]
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def summarize_dataframe(df: pd.DataFrame, round_digits: int = 2) -> pd.DataFrame:
    """
    Returns a summary DataFrame, indexed by column name, with the data type,
    completeness percentage (share of non-null entries) and number of unique
    values of each column. Numeric columns additionally get basic statistics:
    mean, std, min, the 10/25/50/70/80/90th percentiles and max.

    The shape of the input DataFrame is printed to stdout.

    Parameters:
        df (pd.DataFrame): The input DataFrame.
        round_digits (int): Decimal places used to round the completeness
            percentage and the numerical statistics.

    Returns:
        pd.DataFrame: Summary table. Statistic columns are only present when
        `df` has at least one numeric column; non-numeric rows hold NaN there.
    """
    summary = pd.DataFrame({
        'Data Type': df.dtypes,
        # Honour `round_digits` here as well, instead of a hard-coded 2
        'Completeness %': (df.notnull().mean() * 100).round(round_digits),
        'Unique Values': df.nunique(),
    })

    # For numeric columns, add summary statistics
    numeric_cols = df.select_dtypes(include='number').columns
    if not numeric_cols.empty:
        stats = df[numeric_cols].describe(percentiles=[.10, .25, .50, .70, .80, .90])

        # describe() puts statistics on the rows; transpose so each input
        # column becomes a row that can be joined onto the summary index.
        summary_numeric = stats.T[
            ['mean', 'std', 'min', '10%', '25%', '50%', '70%', '80%', '90%', 'max']
        ].round(round_digits)
        summary = summary.join(summary_numeric, how='left')

    print(f"DataFrame Shape: {df.shape}")
    return summary
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def show_null_columns(
    df: pd.DataFrame,
    round_digits: int = 2,
    plot_to_dir: Optional[Union[str, Path]] = None,
    plot_filename: Optional[str] = None,
    use_all_columns: bool = False
) -> pd.DataFrame:
    """
    Returns a table of columns with missing values, showing both the count and
    percentage of missing entries per column, sorted by descending percentage.

    Optionally generates a visualization of the missing data profile.

    Parameters:
        df (pd.DataFrame): The input DataFrame.
        round_digits (int): Number of decimal places for the percentage.
        plot_to_dir (str | Path | None): If provided, saves a visualization of the
            missing data to this directory as an SVG file.
        plot_filename (str): The filename for the saved plot (without extension).
            Used only if `plot_to_dir` is set. It is sanitized and prefixed with
            "Missing_Data_"; if None or blank, "Missing_Data_Profile" is used.
        use_all_columns (bool): If True, includes all columns in the summary and plot,
            even those with no missing values.

    Returns:
        pd.DataFrame: A DataFrame summarizing missing values in each column
        ('Missing Count' and 'Missing %').

    Notes:
        Plotting is best-effort: any exception raised while building or saving
        the plot is logged and the summary is still returned.
    """
    null_counts = df.isnull().sum()
    null_percent = df.isnull().mean() * 100

    if use_all_columns:
        null_summary = pd.DataFrame({
            'Missing Count': null_counts,
            'Missing %': null_percent.round(round_digits)
        })
    else:
        # Filter only columns with at least one null
        mask = null_counts > 0
        null_summary = pd.DataFrame({
            'Missing Count': null_counts[mask],
            'Missing %': null_percent[mask].round(round_digits)
        })

    # Sort by descending percentage of missing values
    null_summary = null_summary.sort_values(by='Missing %', ascending=False)

    # --- Visualization Logic ---
    if plot_to_dir:
        if null_summary.empty:
            _LOGGER.info("No missing data found. Skipping plot generation.")
        else:
            _save_missing_data_plot(null_summary, plot_to_dir, plot_filename)

    return null_summary


def _save_missing_data_plot(
    null_summary: pd.DataFrame,
    plot_to_dir: Union[str, Path],
    plot_filename: Optional[str],
) -> None:
    """Draws the stacked present/missing bar chart for `null_summary` and saves it as an SVG."""
    # Track the figure created here so that cleanup only ever closes this
    # figure, never a figure the caller happens to have open.
    fig = None
    try:
        # Validate and create save directory
        save_path = make_fullpath(plot_to_dir, make=True, enforce="directory")

        # Prepare data
        features = null_summary.index.tolist()
        missing_pct = np.array(null_summary['Missing %'].values)
        present_pct = 100 - missing_pct
        n_features = len(features)

        # Dynamic width: grows with the number of features, never below 10
        width = max(10, n_features * 0.4)
        fig = plt.figure(figsize=(width, 8))

        # Grid behind bars (zorder lower than the bars)
        plt.grid(axis='y', linestyle='--', alpha=0.5, zorder=0)

        # 1. Present Data: Solid Green
        plt.bar(
            features,
            present_pct,
            color='tab:green',
            label='Present',
            width=0.6,
            zorder=3
        )

        # 2. Missing Data: translucent fill + solid red hatch, stacked on top
        # of the present portion. facecolor carries the alpha, edgecolor
        # (used for the hatch lines) does not.
        # NOTE(review): the RGBA below is translucent white, although this was
        # previously described as a red fill - confirm which is intended.
        plt.bar(
            features,
            missing_pct,
            bottom=present_pct,
            facecolor=(1.0, 1.0, 1.0, 0.2),  # RGBA
            edgecolor='tab:red',             # Solid red for the hatch lines
            hatch='///',                     # hatch pattern
            linewidth=0.4,
            label='Missing',
            width=0.6,
            zorder=3
        )

        # Styling
        plt.ylim(0, 100)
        plt.ylabel("Data Completeness (%)", fontsize=13)
        plt.yticks(np.arange(0, 101, 10))
        plot_title = f"Missing Data - {plot_filename.replace('_', ' ')}" if plot_filename else "Missing Data"
        plt.title(plot_title)
        plt.xticks(rotation=45, ha='right', fontsize=9)

        # Reference line
        plt.axhline(y=100, color='black', linestyle='-', linewidth=0.5, alpha=0.3)

        plt.legend(loc='lower right', framealpha=0.95)
        plt.tight_layout()

        # Save
        if plot_filename is None or plot_filename.strip() == "":
            plot_filename = "Missing_Data_Profile"
        else:
            plot_filename = "Missing_Data_" + sanitize_filename(plot_filename)

        full_filename = plot_filename + ".svg"
        plt.savefig(save_path / full_filename, format='svg', bbox_inches="tight")

        _LOGGER.info(f"Saved missing data plot as '{full_filename}'")

    except Exception as e:
        # Deliberate best-effort: a plotting failure must not lose the summary
        _LOGGER.error(f"Failed to generate missing data plot. Error: {e}")
    finally:
        if fig is not None:
            plt.close(fig)
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def match_and_filter_columns_by_regex(
    df: pd.DataFrame,
    pattern: str,
    case_sensitive: bool = False,
    escape_pattern: bool = False
) -> tuple[pd.DataFrame, list[str]]:
    """
    Return a tuple of (filtered DataFrame, matched column names) based on a regex pattern.

    The pattern is searched anywhere in each column name (it does not need to
    match the whole name). Column labels that are not strings are compared
    through their string representation.

    Parameters:
        df (pd.DataFrame): The DataFrame to search.
        pattern (str): The regex pattern to match column names (use a raw string).
        case_sensitive (bool): Whether matching is case-sensitive.
        escape_pattern (bool): If True, the pattern is escaped with `re.escape()` to treat it literally.

    Returns:
        (Tuple[pd.DataFrame, list[str]]): A DataFrame filtered to matched columns, and a list of matching column names.
    """
    if escape_pattern:
        pattern = re.escape(pattern)

    # Cast labels to str first: the `.str` accessor rejects non-string labels
    # (e.g. an integer column index) and yields NaN for mixed-type labels.
    mask = df.columns.astype(str).str.contains(pattern, case=case_sensitive, regex=True)
    matched_columns = df.columns[mask].to_list()
    # Selecting columns with a boolean mask always yields a DataFrame,
    # even when zero or one column matches.
    filtered_df = df.loc[:, mask]

    _LOGGER.info(f"{len(matched_columns)} columns match the regex pattern '{pattern}'.")

    return filtered_df, matched_columns
|
|
214
|
+
|