dragon-ml-toolbox 19.13.0__py3-none-any.whl → 20.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dragon_ml_toolbox-19.13.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/METADATA +29 -46
- dragon_ml_toolbox-20.0.0.dist-info/RECORD +178 -0
- ml_tools/{ETL_cleaning.py → ETL_cleaning/__init__.py} +13 -5
- ml_tools/ETL_cleaning/_basic_clean.py +351 -0
- ml_tools/ETL_cleaning/_clean_tools.py +128 -0
- ml_tools/ETL_cleaning/_dragon_cleaner.py +245 -0
- ml_tools/ETL_cleaning/_imprimir.py +13 -0
- ml_tools/{ETL_engineering.py → ETL_engineering/__init__.py} +8 -4
- ml_tools/ETL_engineering/_dragon_engineering.py +261 -0
- ml_tools/ETL_engineering/_imprimir.py +24 -0
- ml_tools/{_core/_ETL_engineering.py → ETL_engineering/_transforms.py} +14 -267
- ml_tools/{_core → GUI_tools}/_GUI_tools.py +37 -40
- ml_tools/{GUI_tools.py → GUI_tools/__init__.py} +7 -5
- ml_tools/GUI_tools/_imprimir.py +12 -0
- ml_tools/IO_tools/_IO_loggers.py +235 -0
- ml_tools/IO_tools/_IO_save_load.py +151 -0
- ml_tools/IO_tools/_IO_utils.py +140 -0
- ml_tools/{IO_tools.py → IO_tools/__init__.py} +13 -5
- ml_tools/IO_tools/_imprimir.py +14 -0
- ml_tools/MICE/_MICE_imputation.py +132 -0
- ml_tools/{MICE_imputation.py → MICE/__init__.py} +6 -7
- ml_tools/{_core/_MICE_imputation.py → MICE/_dragon_mice.py} +243 -322
- ml_tools/MICE/_imprimir.py +11 -0
- ml_tools/{ML_callbacks.py → ML_callbacks/__init__.py} +12 -4
- ml_tools/ML_callbacks/_base.py +101 -0
- ml_tools/ML_callbacks/_checkpoint.py +232 -0
- ml_tools/ML_callbacks/_early_stop.py +208 -0
- ml_tools/ML_callbacks/_imprimir.py +12 -0
- ml_tools/ML_callbacks/_scheduler.py +197 -0
- ml_tools/{ML_chaining_utilities.py → ML_chain/__init__.py} +8 -3
- ml_tools/{_core/_ML_chaining_utilities.py → ML_chain/_chaining_tools.py} +5 -129
- ml_tools/ML_chain/_dragon_chain.py +140 -0
- ml_tools/ML_chain/_imprimir.py +11 -0
- ml_tools/ML_configuration/__init__.py +90 -0
- ml_tools/ML_configuration/_base_model_config.py +69 -0
- ml_tools/ML_configuration/_finalize.py +366 -0
- ml_tools/ML_configuration/_imprimir.py +47 -0
- ml_tools/ML_configuration/_metrics.py +593 -0
- ml_tools/ML_configuration/_models.py +206 -0
- ml_tools/ML_configuration/_training.py +124 -0
- ml_tools/ML_datasetmaster/__init__.py +28 -0
- ml_tools/ML_datasetmaster/_base_datasetmaster.py +337 -0
- ml_tools/{_core/_ML_datasetmaster.py → ML_datasetmaster/_datasetmaster.py} +9 -329
- ml_tools/ML_datasetmaster/_imprimir.py +15 -0
- ml_tools/{_core/_ML_sequence_datasetmaster.py → ML_datasetmaster/_sequence_datasetmaster.py} +13 -15
- ml_tools/{_core/_ML_vision_datasetmaster.py → ML_datasetmaster/_vision_datasetmaster.py} +63 -65
- ml_tools/ML_evaluation/__init__.py +53 -0
- ml_tools/ML_evaluation/_classification.py +629 -0
- ml_tools/ML_evaluation/_feature_importance.py +409 -0
- ml_tools/ML_evaluation/_imprimir.py +25 -0
- ml_tools/ML_evaluation/_loss.py +92 -0
- ml_tools/ML_evaluation/_regression.py +273 -0
- ml_tools/{_core/_ML_sequence_evaluation.py → ML_evaluation/_sequence.py} +8 -11
- ml_tools/{_core/_ML_vision_evaluation.py → ML_evaluation/_vision.py} +12 -17
- ml_tools/{_core → ML_evaluation_captum}/_ML_evaluation_captum.py +11 -38
- ml_tools/{ML_evaluation_captum.py → ML_evaluation_captum/__init__.py} +6 -4
- ml_tools/ML_evaluation_captum/_imprimir.py +10 -0
- ml_tools/{_core → ML_finalize_handler}/_ML_finalize_handler.py +3 -7
- ml_tools/ML_finalize_handler/__init__.py +10 -0
- ml_tools/ML_finalize_handler/_imprimir.py +8 -0
- ml_tools/ML_inference/__init__.py +22 -0
- ml_tools/ML_inference/_base_inference.py +166 -0
- ml_tools/{_core/_ML_chaining_inference.py → ML_inference/_chain_inference.py} +14 -17
- ml_tools/ML_inference/_dragon_inference.py +332 -0
- ml_tools/ML_inference/_imprimir.py +11 -0
- ml_tools/ML_inference/_multi_inference.py +180 -0
- ml_tools/ML_inference_sequence/__init__.py +10 -0
- ml_tools/ML_inference_sequence/_imprimir.py +8 -0
- ml_tools/{_core/_ML_sequence_inference.py → ML_inference_sequence/_sequence_inference.py} +11 -15
- ml_tools/ML_inference_vision/__init__.py +10 -0
- ml_tools/ML_inference_vision/_imprimir.py +8 -0
- ml_tools/{_core/_ML_vision_inference.py → ML_inference_vision/_vision_inference.py} +15 -19
- ml_tools/ML_models/__init__.py +32 -0
- ml_tools/{_core/_ML_models_advanced.py → ML_models/_advanced_models.py} +22 -18
- ml_tools/ML_models/_base_mlp_attention.py +198 -0
- ml_tools/{_core/_models_advanced_base.py → ML_models/_base_save_load.py} +73 -49
- ml_tools/ML_models/_dragon_tabular.py +248 -0
- ml_tools/ML_models/_imprimir.py +18 -0
- ml_tools/ML_models/_mlp_attention.py +134 -0
- ml_tools/{_core → ML_models}/_models_advanced_helpers.py +13 -13
- ml_tools/ML_models_sequence/__init__.py +10 -0
- ml_tools/ML_models_sequence/_imprimir.py +8 -0
- ml_tools/{_core/_ML_sequence_models.py → ML_models_sequence/_sequence_models.py} +5 -8
- ml_tools/ML_models_vision/__init__.py +29 -0
- ml_tools/ML_models_vision/_base_wrapper.py +254 -0
- ml_tools/ML_models_vision/_image_classification.py +182 -0
- ml_tools/ML_models_vision/_image_segmentation.py +108 -0
- ml_tools/ML_models_vision/_imprimir.py +16 -0
- ml_tools/ML_models_vision/_object_detection.py +135 -0
- ml_tools/ML_optimization/__init__.py +21 -0
- ml_tools/ML_optimization/_imprimir.py +13 -0
- ml_tools/{_core/_ML_optimization_pareto.py → ML_optimization/_multi_dragon.py} +18 -24
- ml_tools/ML_optimization/_single_dragon.py +203 -0
- ml_tools/{_core/_ML_optimization.py → ML_optimization/_single_manual.py} +75 -213
- ml_tools/{_core → ML_scaler}/_ML_scaler.py +8 -11
- ml_tools/ML_scaler/__init__.py +10 -0
- ml_tools/ML_scaler/_imprimir.py +8 -0
- ml_tools/ML_trainer/__init__.py +20 -0
- ml_tools/ML_trainer/_base_trainer.py +297 -0
- ml_tools/ML_trainer/_dragon_detection_trainer.py +402 -0
- ml_tools/ML_trainer/_dragon_sequence_trainer.py +540 -0
- ml_tools/ML_trainer/_dragon_trainer.py +1160 -0
- ml_tools/ML_trainer/_imprimir.py +10 -0
- ml_tools/{ML_utilities.py → ML_utilities/__init__.py} +14 -6
- ml_tools/ML_utilities/_artifact_finder.py +382 -0
- ml_tools/ML_utilities/_imprimir.py +16 -0
- ml_tools/ML_utilities/_inspection.py +325 -0
- ml_tools/ML_utilities/_train_tools.py +205 -0
- ml_tools/{ML_vision_transformers.py → ML_vision_transformers/__init__.py} +9 -6
- ml_tools/{_core/_ML_vision_transformers.py → ML_vision_transformers/_core_transforms.py} +11 -155
- ml_tools/ML_vision_transformers/_imprimir.py +14 -0
- ml_tools/ML_vision_transformers/_offline_augmentation.py +159 -0
- ml_tools/{_core/_PSO_optimization.py → PSO_optimization/_PSO.py} +58 -15
- ml_tools/{PSO_optimization.py → PSO_optimization/__init__.py} +5 -3
- ml_tools/PSO_optimization/_imprimir.py +10 -0
- ml_tools/SQL/__init__.py +7 -0
- ml_tools/{_core/_SQL.py → SQL/_dragon_SQL.py} +7 -11
- ml_tools/SQL/_imprimir.py +8 -0
- ml_tools/{_core → VIF}/_VIF_factor.py +5 -8
- ml_tools/{VIF_factor.py → VIF/__init__.py} +4 -2
- ml_tools/VIF/_imprimir.py +10 -0
- ml_tools/_core/__init__.py +7 -1
- ml_tools/_core/_logger.py +8 -18
- ml_tools/_core/_schema_load_ops.py +43 -0
- ml_tools/_core/_script_info.py +2 -2
- ml_tools/{data_exploration.py → data_exploration/__init__.py} +32 -16
- ml_tools/data_exploration/_analysis.py +214 -0
- ml_tools/data_exploration/_cleaning.py +566 -0
- ml_tools/data_exploration/_features.py +583 -0
- ml_tools/data_exploration/_imprimir.py +32 -0
- ml_tools/data_exploration/_plotting.py +487 -0
- ml_tools/data_exploration/_schema_ops.py +176 -0
- ml_tools/{ensemble_evaluation.py → ensemble_evaluation/__init__.py} +6 -4
- ml_tools/{_core → ensemble_evaluation}/_ensemble_evaluation.py +3 -7
- ml_tools/ensemble_evaluation/_imprimir.py +14 -0
- ml_tools/{ensemble_inference.py → ensemble_inference/__init__.py} +5 -3
- ml_tools/{_core → ensemble_inference}/_ensemble_inference.py +15 -18
- ml_tools/ensemble_inference/_imprimir.py +9 -0
- ml_tools/{ensemble_learning.py → ensemble_learning/__init__.py} +4 -6
- ml_tools/{_core → ensemble_learning}/_ensemble_learning.py +7 -10
- ml_tools/ensemble_learning/_imprimir.py +10 -0
- ml_tools/{excel_handler.py → excel_handler/__init__.py} +5 -3
- ml_tools/{_core → excel_handler}/_excel_handler.py +6 -10
- ml_tools/excel_handler/_imprimir.py +13 -0
- ml_tools/{keys.py → keys/__init__.py} +4 -1
- ml_tools/keys/_imprimir.py +11 -0
- ml_tools/{_core → keys}/_keys.py +2 -0
- ml_tools/{math_utilities.py → math_utilities/__init__.py} +5 -2
- ml_tools/math_utilities/_imprimir.py +11 -0
- ml_tools/{_core → math_utilities}/_math_utilities.py +1 -5
- ml_tools/{optimization_tools.py → optimization_tools/__init__.py} +9 -4
- ml_tools/optimization_tools/_imprimir.py +13 -0
- ml_tools/optimization_tools/_optimization_bounds.py +236 -0
- ml_tools/optimization_tools/_optimization_plots.py +218 -0
- ml_tools/{path_manager.py → path_manager/__init__.py} +6 -3
- ml_tools/{_core/_path_manager.py → path_manager/_dragonmanager.py} +11 -347
- ml_tools/path_manager/_imprimir.py +15 -0
- ml_tools/path_manager/_path_tools.py +346 -0
- ml_tools/plot_fonts/__init__.py +8 -0
- ml_tools/plot_fonts/_imprimir.py +8 -0
- ml_tools/{_core → plot_fonts}/_plot_fonts.py +2 -5
- ml_tools/schema/__init__.py +15 -0
- ml_tools/schema/_feature_schema.py +223 -0
- ml_tools/schema/_gui_schema.py +191 -0
- ml_tools/schema/_imprimir.py +10 -0
- ml_tools/{serde.py → serde/__init__.py} +4 -2
- ml_tools/serde/_imprimir.py +10 -0
- ml_tools/{_core → serde}/_serde.py +3 -8
- ml_tools/{utilities.py → utilities/__init__.py} +11 -6
- ml_tools/utilities/_imprimir.py +18 -0
- ml_tools/{_core/_utilities.py → utilities/_utility_save_load.py} +13 -190
- ml_tools/utilities/_utility_tools.py +192 -0
- dragon_ml_toolbox-19.13.0.dist-info/RECORD +0 -111
- ml_tools/ML_chaining_inference.py +0 -8
- ml_tools/ML_configuration.py +0 -86
- ml_tools/ML_configuration_pytab.py +0 -14
- ml_tools/ML_datasetmaster.py +0 -10
- ml_tools/ML_evaluation.py +0 -16
- ml_tools/ML_evaluation_multi.py +0 -12
- ml_tools/ML_finalize_handler.py +0 -8
- ml_tools/ML_inference.py +0 -12
- ml_tools/ML_models.py +0 -14
- ml_tools/ML_models_advanced.py +0 -14
- ml_tools/ML_models_pytab.py +0 -14
- ml_tools/ML_optimization.py +0 -14
- ml_tools/ML_optimization_pareto.py +0 -8
- ml_tools/ML_scaler.py +0 -8
- ml_tools/ML_sequence_datasetmaster.py +0 -8
- ml_tools/ML_sequence_evaluation.py +0 -10
- ml_tools/ML_sequence_inference.py +0 -8
- ml_tools/ML_sequence_models.py +0 -8
- ml_tools/ML_trainer.py +0 -12
- ml_tools/ML_vision_datasetmaster.py +0 -12
- ml_tools/ML_vision_evaluation.py +0 -10
- ml_tools/ML_vision_inference.py +0 -8
- ml_tools/ML_vision_models.py +0 -18
- ml_tools/SQL.py +0 -8
- ml_tools/_core/_ETL_cleaning.py +0 -694
- ml_tools/_core/_IO_tools.py +0 -498
- ml_tools/_core/_ML_callbacks.py +0 -702
- ml_tools/_core/_ML_configuration.py +0 -1332
- ml_tools/_core/_ML_configuration_pytab.py +0 -102
- ml_tools/_core/_ML_evaluation.py +0 -867
- ml_tools/_core/_ML_evaluation_multi.py +0 -544
- ml_tools/_core/_ML_inference.py +0 -646
- ml_tools/_core/_ML_models.py +0 -668
- ml_tools/_core/_ML_models_pytab.py +0 -693
- ml_tools/_core/_ML_trainer.py +0 -2323
- ml_tools/_core/_ML_utilities.py +0 -886
- ml_tools/_core/_ML_vision_models.py +0 -644
- ml_tools/_core/_data_exploration.py +0 -1901
- ml_tools/_core/_optimization_tools.py +0 -493
- ml_tools/_core/_schema.py +0 -359
- ml_tools/plot_fonts.py +0 -8
- ml_tools/schema.py +0 -12
- {dragon_ml_toolbox-19.13.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-19.13.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-19.13.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-19.13.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/top_level.txt +0 -0
ml_tools/schema/_gui_schema.py (new file)

@@ -0,0 +1,191 @@
+from typing import Union, Any
+from pathlib import Path
+import json
+
+from ..path_manager import make_fullpath
+
+from ..keys._keys import SchemaKeys
+from .._core import get_logger
+
+from ._feature_schema import FeatureSchema
+
+
+_LOGGER = get_logger("GUISchema")
+
+
+__all__ = [
+    "create_guischema_template",
+    "make_multibinary_groups",
+]
+
+
+def create_guischema_template(
+    directory: Union[str, Path],
+    feature_schema: FeatureSchema,
+    targets: list[str],
+    continuous_ranges: dict[str, tuple[float, float]],
+    multibinary_groups: Union[dict[str, list[str]], None] = None,
+) -> None:
+    """
+    Generates a 'GUISchema.json' boilerplate file based on the Model FeatureSchema.
+
+    The generated JSON contains entries with empty "gui_name" fields for manual mapping.
+    Leave 'gui_name' empty to use auto-formatted Title Case.
+
+    Args:
+        directory (str | Path): Where to save the json file.
+        feature_schema (FeatureSchema): The source FeatureSchema object.
+        targets (list[str]): List of target names as used in the ML pipeline.
+        continuous_ranges (Dict[str, Tuple[float, float]]): Dict {model_name: (min, max)}.
+        multibinary_groups (Dict[str, list[str]] | None): Optional Dict {GUI_Group_Name: [model_col_1, model_col_2]}.
+            Used to group binary columns into a single multi-select list.
+    """
+    dir_path = make_fullpath(directory, make=True, enforce="directory")
+
+    schema = feature_schema
+    output_data: dict[str, Any] = {
+        SchemaKeys.TARGETS: [],
+        SchemaKeys.CONTINUOUS: [],
+        SchemaKeys.BINARY: [],
+        SchemaKeys.MULTIBINARY: {},  # Structure: GroupName: [{model: x, gui: ""}]
+        SchemaKeys.CATEGORICAL: []
+    }
+
+    # Track handled columns to prevent duplicates in binary/categorical
+    handled_cols = set()
+
+    # 1. Targets
+    for t in targets:
+        output_data[SchemaKeys.TARGETS].append({
+            SchemaKeys.MODEL_NAME: t,
+            SchemaKeys.GUI_NAME: ""  # User to fill
+        })
+
+    # 2. Continuous
+    # Validate ranges against schema
+    schema_cont_set = set(schema.continuous_feature_names)
+    for name, min_max in continuous_ranges.items():
+        if name in schema_cont_set:
+            output_data[SchemaKeys.CONTINUOUS].append({
+                SchemaKeys.MODEL_NAME: name,
+                SchemaKeys.GUI_NAME: "",
+                SchemaKeys.MIN_VALUE: min_max[0],
+                SchemaKeys.MAX_VALUE: min_max[1]
+            })
+            handled_cols.add(name)
+        else:
+            _LOGGER.warning(f"GUISchema: Provided range for '{name}', but it is not in FeatureSchema continuous list.")
+
+    # 3. Multi-Binary Groups
+    if multibinary_groups:
+        # Check for validity within the generic feature list
+        all_feats = set(schema.feature_names)
+
+        for group_name, cols in multibinary_groups.items():
+            # Validation: Groups cannot be empty
+            if not cols:
+                # warn and skip
+                _LOGGER.warning(f"GUISchema: Multi-binary group '{group_name}' is empty and will be skipped.")
+                continue
+
+            group_options = []
+            for col in cols:
+                # Validation: Columns must exist in schema
+                if col not in all_feats:
+                    # warn and skip
+                    _LOGGER.warning(f"GUISchema: Multi-binary column '{col}' in group '{group_name}' not found in FeatureSchema. Skipping.")
+                    continue
+                # else, add to group
+                group_options.append({
+                    SchemaKeys.MODEL_NAME: col,
+                    SchemaKeys.GUI_NAME: ""
+                })
+                handled_cols.add(col)
+            output_data[SchemaKeys.MULTIBINARY][group_name] = group_options
+
+    # 4. Binary & Categorical (Derived from Schema Mappings)
+    if schema.categorical_mappings:
+        for name, mapping in schema.categorical_mappings.items():
+            if name in handled_cols:
+                continue
+
+            # Heuristic: Cardinality 2 = Binary, >2 = Categorical
+            if len(mapping) == 2:
+                output_data[SchemaKeys.BINARY].append({
+                    SchemaKeys.MODEL_NAME: name,
+                    SchemaKeys.GUI_NAME: ""  # User to fill
+                })
+            else:
+                # For categorical, we also allow renaming the specific options
+                options_with_names = {k: "" for k in mapping.keys()}  # Default gui_option = model_option
+
+                output_data[SchemaKeys.CATEGORICAL].append({
+                    SchemaKeys.MODEL_NAME: name,
+                    SchemaKeys.GUI_NAME: "",  # User to fill feature name
+                    SchemaKeys.MAPPING: mapping,  # Original mapping
+                    SchemaKeys.OPTIONAL_LABELS: options_with_names  # User can edit keys here
+                })
+
+    save_path = dir_path / SchemaKeys.GUI_SCHEMA_FILENAME
+    try:
+        with open(save_path, 'w', encoding='utf-8') as f:
+            json.dump(output_data, f, indent=4)
+        _LOGGER.info(f"GUISchema template generated at: '{dir_path.name}/{SchemaKeys.GUI_SCHEMA_FILENAME}'")
+    except IOError as e:
+        _LOGGER.error(f"Failed to save GUISchema template: {e}")
+
+
+def make_multibinary_groups(
+    feature_schema: FeatureSchema,
+    group_prefixes: list[str],
+    separator: str = "_"
+) -> dict[str, list[str]]:
+    """
+    Helper to automate creating the multibinary_groups dictionary for create_guischema_template.
+
+    Iterates through provided prefixes and groups categorical features that contain
+    the pattern '{prefix}{separator}'.
+
+    Args:
+        feature_schema: The loaded FeatureSchema containing categorical feature names.
+        group_prefixes: A list of group prefixes to search for.
+        separator: The separator used in Multibinary Encoding (default '_').
+
+    Returns:
+        Dict[str, list[str]]: A dictionary mapping group names to their found column names.
+    """
+    groups: dict[str, list[str]] = {}
+
+    # check that categorical features exist
+    if not feature_schema.categorical_feature_names:
+        _LOGGER.error("FeatureSchema has no categorical features defined.")
+        raise ValueError()
+
+    # validate separator
+    if not separator or not isinstance(separator, str):
+        _LOGGER.error(f"Invalid separator '{separator}' of type {type(separator)}.")
+        raise ValueError()
+
+    for prefix in group_prefixes:
+        if not prefix or not isinstance(prefix, str):
+            _LOGGER.error(f"Invalid prefix '{prefix}' of type {type(prefix)}.")
+            raise ValueError()
+
+        search_term = f"{prefix}{separator}"
+
+        # check if substring exists in the column name. must begin with prefix+separator
+        cols = [
+            name for name in feature_schema.categorical_feature_names
+            if name.startswith(search_term)
+        ]
+
+        if cols:
+            groups[prefix] = cols
+        else:
+            _LOGGER.warning(f"No columns found for group '{prefix}' using search term '{search_term}'")
+
+    # log resulting groups
+    _LOGGER.info(f"Multibinary groups created: {list(groups.keys())}")
+
+    return groups
+
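For orientation, a minimal usage sketch of the two helpers added above. The feature and target names, ranges, prefix, and output directory are hypothetical placeholders, and the sketch assumes the new ml_tools/schema/__init__.py re-exports both functions alongside FeatureSchema (the __init__.py itself is not shown in this section):

    from ml_tools.schema import FeatureSchema, create_guischema_template, make_multibinary_groups

    def write_gui_template(schema: FeatureSchema) -> None:
        # Group binary columns that share a hypothetical "Solvent" prefix, e.g. "Solvent_water"
        groups = make_multibinary_groups(schema, group_prefixes=["Solvent"])

        create_guischema_template(
            directory="output/gui",                            # hypothetical output directory
            feature_schema=schema,
            targets=["target_1"],                              # hypothetical target name
            continuous_ranges={"temperature": (0.0, 100.0)},   # hypothetical continuous feature and range
            multibinary_groups=groups,
        )
        # Writes a 'GUISchema.json' template whose empty "gui_name" fields are filled in manually.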
ml_tools/{_core → serde}/_serde.py

@@ -3,9 +3,8 @@ from joblib.externals.loky.process_executor import TerminatedWorkerError
 from typing import Any, Union, TypeVar, get_origin, Type, Optional
 from pathlib import Path
 
-from
-from
-from ._logger import get_logger
+from ..path_manager import make_fullpath, sanitize_filename
+from .._core import get_logger
 
 
 _LOGGER = get_logger("SERDE")
@@ -95,7 +94,7 @@ def serialize_object(obj: Any, file_path: Path, verbose: bool = True, raise_on_e
 
 # Define a TypeVar to link the expected type to the return type of deserialization
 T = TypeVar('T')
-
+
 def deserialize_object(
     filepath: Union[str, Path],
     expected_type: Optional[Type[T]] = None,
@@ -146,7 +145,3 @@ def deserialize_object(
     _LOGGER.info(f"Loaded object '{obj}' from '{true_filepath}'.")
 
     return obj # type: ignore
-
-
-def info():
-    _script_info(__all__)
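A small round-trip sketch of the relocated serde helpers. Only the parameters visible in these hunks are used; the object, path, and file extension are hypothetical, and the sketch assumes ml_tools.serde re-exports both functions (their full signatures are not shown in this diff):

    from pathlib import Path
    from ml_tools.serde import serialize_object, deserialize_object

    params = {"n_estimators": 100, "max_depth": 8}   # hypothetical object to persist
    artifact = Path("artifacts/params.joblib")        # hypothetical path and extension

    serialize_object(params, artifact)
    restored = deserialize_object(artifact, expected_type=dict)  # TypeVar T ties the return type to dict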
ml_tools/{utilities.py → utilities/__init__.py}

@@ -1,27 +1,32 @@
-from .
+from ._utility_save_load import (
     load_dataframe,
     load_dataframe_greedy,
     load_dataframe_with_schema,
     yield_dataframes_from_dir,
-    merge_dataframes,
     save_dataframe_filename,
     save_dataframe,
-    save_dataframe_with_schema
+    save_dataframe_with_schema
+)
+
+from ._utility_tools import (
+    merge_dataframes,
     distribute_dataset_by_target,
     train_dataset_orchestrator,
-    train_dataset_yielder
-    info
+    train_dataset_yielder
 )
 
+from ._imprimir import info
+
+
 __all__ = [
     "load_dataframe",
     "load_dataframe_greedy",
     "load_dataframe_with_schema",
     "yield_dataframes_from_dir",
-    "merge_dataframes",
     "save_dataframe_filename",
     "save_dataframe",
     "save_dataframe_with_schema",
+    "merge_dataframes",
     "distribute_dataset_by_target",
     "train_dataset_orchestrator",
     "train_dataset_yielder"
ml_tools/utilities/_imprimir.py (new file)

@@ -0,0 +1,18 @@
+from .._core import _imprimir_disponibles
+
+_GRUPOS = [
+    "load_dataframe",
+    "load_dataframe_greedy",
+    "load_dataframe_with_schema",
+    "yield_dataframes_from_dir",
+    "save_dataframe_filename",
+    "save_dataframe",
+    "save_dataframe_with_schema",
+    "merge_dataframes",
+    "distribute_dataset_by_target",
+    "train_dataset_orchestrator",
+    "train_dataset_yielder"
+]
+
+def info():
+    _imprimir_disponibles(_GRUPOS)
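The _imprimir.py modules introduced throughout this release follow one pattern: each subpackage exposes an info() helper that reports its public names via _imprimir_disponibles. A hedged sketch of how it is reached after the hunk above (the exact output format is not shown in this diff):

    from ml_tools import utilities

    utilities.info()  # presumably prints the names listed in _GRUPOS, e.g. "load_dataframe", "merge_dataframes", ...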
ml_tools/{_core/_utilities.py → utilities/_utility_save_load.py}

@@ -1,16 +1,16 @@
-import numpy as np
 import pandas as pd
 import polars as pl
+import numpy as np
 from pathlib import Path
-from typing import Literal, Union, Optional, Any,
+from typing import Literal, Union, Optional, Any, overload
+
+from ..schema import FeatureSchema
 
-from
-from
-from ._logger import get_logger
-from ._schema import FeatureSchema
+from ..path_manager import make_fullpath, list_csv_paths, sanitize_filename
+from .._core import get_logger
 
 
-_LOGGER = get_logger("Utilities")
+_LOGGER = get_logger("Save/Load Utilities")
 
 
 __all__ = [
@@ -18,16 +18,13 @@ __all__ = [
     "load_dataframe_greedy",
     "load_dataframe_with_schema",
     "yield_dataframes_from_dir",
-    "merge_dataframes",
     "save_dataframe_filename",
     "save_dataframe",
-    "save_dataframe_with_schema"
-    "distribute_dataset_by_target",
-    "train_dataset_orchestrator",
-    "train_dataset_yielder"
+    "save_dataframe_with_schema"
 ]
 
 
+
 # Overload 1: When kind='pandas'
 @overload
 def load_dataframe(
@@ -36,7 +33,7 @@ def load_dataframe(
     kind: Literal["pandas"] = "pandas",
     all_strings: bool = False,
     verbose: bool = True
-) ->
+) -> tuple[pd.DataFrame, str]:
     ... # for overload stubs
 
 # Overload 2: When kind='polars'
@@ -47,7 +44,7 @@
     kind: Literal["polars"] = "polars",
     all_strings: bool = False,
     verbose: bool = True
-) ->
+) -> tuple[pl.DataFrame, str]:
     ... # for overload stubs
 
 def load_dataframe(
@@ -56,7 +53,7 @@
     kind: Literal["pandas", "polars"] = "pandas",
     all_strings: bool = False,
     verbose: bool = True
-) -> Union[
+) -> Union[tuple[pd.DataFrame, str], tuple[pl.DataFrame, str]]:
     """
     Load a CSV file into a DataFrame and extract its base name.
 
@@ -187,7 +184,7 @@ def load_dataframe_with_schema(
     df_path: Union[str, Path],
     schema: "FeatureSchema",
     all_strings: bool = False,
-) ->
+) -> tuple[pd.DataFrame, str]:
     """
     Loads a CSV file into a Pandas DataFrame, strictly validating its
     feature columns against a FeatureSchema.
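The overload hunks above annotate the return value as a (DataFrame, base name) tuple for both backends. A brief sketch with a hypothetical CSV path; the positional path argument mirrors the call sites visible elsewhere in this diff:

    from ml_tools.utilities import load_dataframe

    # kind="pandas" (default) -> tuple[pd.DataFrame, str]
    df_pd, name = load_dataframe("data/measurements.csv")               # hypothetical path

    # kind="polars" -> tuple[pl.DataFrame, str]
    df_pl, _ = load_dataframe("data/measurements.csv", kind="polars")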
ml_tools/{_core/_utilities.py → utilities/_utility_save_load.py} (continued)

@@ -271,65 +268,6 @@ def yield_dataframes_from_dir(datasets_dir: Union[str,Path], verbose: bool=True)
         yield df, df_name
 
 
-def merge_dataframes(
-    *dfs: pd.DataFrame,
-    reset_index: bool = False,
-    direction: Literal["horizontal", "vertical"] = "horizontal",
-    verbose: bool=True
-) -> pd.DataFrame:
-    """
-    Merges multiple DataFrames either horizontally or vertically.
-
-    Parameters:
-        *dfs (pd.DataFrame): Variable number of DataFrames to merge.
-        reset_index (bool): Whether to reset index in the final merged DataFrame.
-        direction (["horizontal" | "vertical"]):
-            - "horizontal": Merge on index, adding columns.
-            - "vertical": Append rows; all DataFrames must have identical columns.
-
-    Returns:
-        pd.DataFrame: A single merged DataFrame.
-
-    Raises:
-        ValueError:
-            - If fewer than 2 DataFrames are provided.
-            - If indexes do not match for horizontal merge.
-            - If column names or order differ for vertical merge.
-    """
-    if len(dfs) < 2:
-        raise ValueError("❌ At least 2 DataFrames must be provided.")
-
-    if verbose:
-        for i, df in enumerate(dfs, start=1):
-            print(f"➡️ DataFrame {i} shape: {df.shape}")
-
-
-    if direction == "horizontal":
-        reference_index = dfs[0].index
-        for i, df in enumerate(dfs, start=1):
-            if not df.index.equals(reference_index):
-                raise ValueError(f"❌ Indexes do not match: Dataset 1 and Dataset {i}.")
-        merged_df = pd.concat(dfs, axis=1)
-
-    elif direction == "vertical":
-        reference_columns = dfs[0].columns
-        for i, df in enumerate(dfs, start=1):
-            if not df.columns.equals(reference_columns):
-                raise ValueError(f"❌ Column names/order do not match: Dataset 1 and Dataset {i}.")
-        merged_df = pd.concat(dfs, axis=0)
-
-    else:
-        _LOGGER.error(f"Invalid merge direction: {direction}")
-        raise ValueError()
-
-    if reset_index:
-        merged_df = merged_df.reset_index(drop=True)
-
-    if verbose:
-        _LOGGER.info(f"Merged DataFrame shape: {merged_df.shape}")
-
-    return merged_df
-
 
 def save_dataframe_filename(df: Union[pd.DataFrame, pl.DataFrame], save_dir: Union[str,Path], filename: str) -> None:
     """
@@ -448,118 +386,6 @@ def save_dataframe_with_schema(
     save_dataframe(df=df_to_save, full_path=full_path)
 
 
-def distribute_dataset_by_target(
-    df_or_path: Union[pd.DataFrame, str, Path],
-    target_columns: list[str],
-    verbose: bool = False
-) -> Iterator[Tuple[str, pd.DataFrame]]:
-    """
-    Yields cleaned DataFrames for each target column, where rows with missing
-    target values are removed. The target column is placed at the end.
-
-    Parameters
-    ----------
-    df_or_path : [pd.DataFrame | str | Path]
-        Dataframe or path to Dataframe with all feature and target columns ready to split and train a model.
-    target_columns : List[str]
-        List of target column names to generate per-target DataFrames.
-    verbose: bool
-        Whether to print info for each yielded dataset.
-
-    Yields
-    ------
-    Tuple[str, pd.DataFrame]
-        * Target name.
-        * Pandas DataFrame.
-    """
-    # Validate path or dataframe
-    if isinstance(df_or_path, str) or isinstance(df_or_path, Path):
-        df_path = make_fullpath(df_or_path)
-        df, _ = load_dataframe(df_path)
-    else:
-        df = df_or_path
-
-    valid_targets = [col for col in df.columns if col in target_columns]
-    feature_columns = [col for col in df.columns if col not in valid_targets]
-
-    for target in valid_targets:
-        subset = df[feature_columns + [target]].dropna(subset=[target]) # type: ignore
-        if verbose:
-            print(f"Target: '{target}' - Dataframe shape: {subset.shape}")
-        yield target, subset
-
-
-def train_dataset_orchestrator(list_of_dirs: list[Union[str,Path]],
-                               target_columns: list[str],
-                               save_dir: Union[str,Path],
-                               safe_mode: bool=False):
-    """
-    Orchestrates the creation of single-target datasets from multiple directories each with a variable number of CSV datasets.
-
-    This function iterates through a list of directories, finds all CSV files,
-    and splits each dataframe based on the provided target columns. Each resulting
-    single-target dataframe is then saved to a specified directory.
-
-    Parameters
-    ----------
-    list_of_dirs : list[str | Path]
-        A list of directory paths where the source CSV files are located.
-    target_columns : list[str]
-        A list of column names to be used as targets for splitting the datasets.
-    save_dir : str | Path
-        The directory where the newly created single-target datasets will be saved.
-    safe_mode : bool
-        If True, prefixes the saved filename with the source directory name to prevent overwriting files with the same name from different sources.
-    """
-    all_dir_paths: list[Path] = list()
-    for dir in list_of_dirs:
-        dir_path = make_fullpath(dir)
-        if not dir_path.is_dir():
-            _LOGGER.error(f"'{dir}' is not a directory.")
-            raise IOError()
-        all_dir_paths.append(dir_path)
-
-    # main loop
-    total_saved = 0
-    for df_dir in all_dir_paths:
-        for df_name, df_path in list_csv_paths(df_dir).items():
-            try:
-                for target_name, df in distribute_dataset_by_target(df_or_path=df_path, target_columns=target_columns, verbose=False):
-                    if safe_mode:
-                        filename = df_dir.name + '_' + target_name + '_' + df_name
-                    else:
-                        filename = target_name + '_' + df_name
-                    save_dataframe_filename(df=df, save_dir=save_dir, filename=filename)
-                    total_saved += 1
-            except Exception as e:
-                _LOGGER.error(f"Failed to process file '{df_path}'. Reason: {e}")
-                continue
-
-    _LOGGER.info(f"{total_saved} single-target datasets were created.")
-
-
-def train_dataset_yielder(
-    df: pd.DataFrame,
-    target_cols: list[str]
-) -> Iterator[Tuple[pd.DataFrame, pd.Series, list[str], str]]:
-    """
-    Yields one tuple at a time:
-    (features_dataframe, target_series, feature_names, target_name)
-
-    Skips any target columns not found in the DataFrame.
-    """
-    # Determine which target columns actually exist in the DataFrame
-    valid_targets = [col for col in target_cols if col in df.columns]
-
-    # Features = all columns excluding valid target columns
-    df_features = df.drop(columns=valid_targets)
-    feature_names = df_features.columns.to_list()
-
-    for target_col in valid_targets:
-        df_target = df[target_col]
-        yield (df_features, df_target, feature_names, target_col)
-
-
 def _validate_and_reorder_schema(
     df: pd.DataFrame,
     schema: "FeatureSchema"
@@ -626,6 +452,3 @@ def _validate_and_reorder_schema(
 
     return df_to_process # type: ignore
 
-
-def info():
-    _script_info(__all__)
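The deletions above are moves, not removals: merge_dataframes, distribute_dataset_by_target, train_dataset_orchestrator and train_dataset_yielder now live in ml_tools/utilities/_utility_tools.py and are still re-exported from ml_tools.utilities, as the __init__.py hunk earlier shows. Assuming their signatures are unchanged after the move, a short sketch of the merge helper:

    import pandas as pd

    from ml_tools.utilities import merge_dataframes

    df_a = pd.DataFrame({"x": [1, 2]})
    df_b = pd.DataFrame({"y": [3, 4]})

    # Horizontal merges require matching indexes; vertical merges require identical columns.
    merged = merge_dataframes(df_a, df_b, direction="horizontal", reset_index=True)
    print(merged.shape)  # (2, 2)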