dragon-ml-toolbox 19.14.0__py3-none-any.whl → 20.0.0__py3-none-any.whl

This diff shows the contents of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
Files changed (219)
  1. {dragon_ml_toolbox-19.14.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/METADATA +29 -46
  2. dragon_ml_toolbox-20.0.0.dist-info/RECORD +178 -0
  3. ml_tools/{ETL_cleaning.py → ETL_cleaning/__init__.py} +13 -5
  4. ml_tools/ETL_cleaning/_basic_clean.py +351 -0
  5. ml_tools/ETL_cleaning/_clean_tools.py +128 -0
  6. ml_tools/ETL_cleaning/_dragon_cleaner.py +245 -0
  7. ml_tools/ETL_cleaning/_imprimir.py +13 -0
  8. ml_tools/{ETL_engineering.py → ETL_engineering/__init__.py} +8 -4
  9. ml_tools/ETL_engineering/_dragon_engineering.py +261 -0
  10. ml_tools/ETL_engineering/_imprimir.py +24 -0
  11. ml_tools/{_core/_ETL_engineering.py → ETL_engineering/_transforms.py} +14 -267
  12. ml_tools/{_core → GUI_tools}/_GUI_tools.py +37 -40
  13. ml_tools/{GUI_tools.py → GUI_tools/__init__.py} +7 -5
  14. ml_tools/GUI_tools/_imprimir.py +12 -0
  15. ml_tools/IO_tools/_IO_loggers.py +235 -0
  16. ml_tools/IO_tools/_IO_save_load.py +151 -0
  17. ml_tools/IO_tools/_IO_utils.py +140 -0
  18. ml_tools/{IO_tools.py → IO_tools/__init__.py} +13 -5
  19. ml_tools/IO_tools/_imprimir.py +14 -0
  20. ml_tools/MICE/_MICE_imputation.py +132 -0
  21. ml_tools/{MICE_imputation.py → MICE/__init__.py} +6 -7
  22. ml_tools/{_core/_MICE_imputation.py → MICE/_dragon_mice.py} +243 -322
  23. ml_tools/MICE/_imprimir.py +11 -0
  24. ml_tools/{ML_callbacks.py → ML_callbacks/__init__.py} +12 -4
  25. ml_tools/ML_callbacks/_base.py +101 -0
  26. ml_tools/ML_callbacks/_checkpoint.py +232 -0
  27. ml_tools/ML_callbacks/_early_stop.py +208 -0
  28. ml_tools/ML_callbacks/_imprimir.py +12 -0
  29. ml_tools/ML_callbacks/_scheduler.py +197 -0
  30. ml_tools/{ML_chaining_utilities.py → ML_chain/__init__.py} +8 -3
  31. ml_tools/{_core/_ML_chaining_utilities.py → ML_chain/_chaining_tools.py} +5 -129
  32. ml_tools/ML_chain/_dragon_chain.py +140 -0
  33. ml_tools/ML_chain/_imprimir.py +11 -0
  34. ml_tools/ML_configuration/__init__.py +90 -0
  35. ml_tools/ML_configuration/_base_model_config.py +69 -0
  36. ml_tools/ML_configuration/_finalize.py +366 -0
  37. ml_tools/ML_configuration/_imprimir.py +47 -0
  38. ml_tools/ML_configuration/_metrics.py +593 -0
  39. ml_tools/ML_configuration/_models.py +206 -0
  40. ml_tools/ML_configuration/_training.py +124 -0
  41. ml_tools/ML_datasetmaster/__init__.py +28 -0
  42. ml_tools/ML_datasetmaster/_base_datasetmaster.py +337 -0
  43. ml_tools/{_core/_ML_datasetmaster.py → ML_datasetmaster/_datasetmaster.py} +9 -329
  44. ml_tools/ML_datasetmaster/_imprimir.py +15 -0
  45. ml_tools/{_core/_ML_sequence_datasetmaster.py → ML_datasetmaster/_sequence_datasetmaster.py} +13 -15
  46. ml_tools/{_core/_ML_vision_datasetmaster.py → ML_datasetmaster/_vision_datasetmaster.py} +63 -65
  47. ml_tools/ML_evaluation/__init__.py +53 -0
  48. ml_tools/ML_evaluation/_classification.py +629 -0
  49. ml_tools/ML_evaluation/_feature_importance.py +409 -0
  50. ml_tools/ML_evaluation/_imprimir.py +25 -0
  51. ml_tools/ML_evaluation/_loss.py +92 -0
  52. ml_tools/ML_evaluation/_regression.py +273 -0
  53. ml_tools/{_core/_ML_sequence_evaluation.py → ML_evaluation/_sequence.py} +8 -11
  54. ml_tools/{_core/_ML_vision_evaluation.py → ML_evaluation/_vision.py} +12 -17
  55. ml_tools/{_core → ML_evaluation_captum}/_ML_evaluation_captum.py +11 -38
  56. ml_tools/{ML_evaluation_captum.py → ML_evaluation_captum/__init__.py} +6 -4
  57. ml_tools/ML_evaluation_captum/_imprimir.py +10 -0
  58. ml_tools/{_core → ML_finalize_handler}/_ML_finalize_handler.py +3 -7
  59. ml_tools/ML_finalize_handler/__init__.py +10 -0
  60. ml_tools/ML_finalize_handler/_imprimir.py +8 -0
  61. ml_tools/ML_inference/__init__.py +22 -0
  62. ml_tools/ML_inference/_base_inference.py +166 -0
  63. ml_tools/{_core/_ML_chaining_inference.py → ML_inference/_chain_inference.py} +14 -17
  64. ml_tools/ML_inference/_dragon_inference.py +332 -0
  65. ml_tools/ML_inference/_imprimir.py +11 -0
  66. ml_tools/ML_inference/_multi_inference.py +180 -0
  67. ml_tools/ML_inference_sequence/__init__.py +10 -0
  68. ml_tools/ML_inference_sequence/_imprimir.py +8 -0
  69. ml_tools/{_core/_ML_sequence_inference.py → ML_inference_sequence/_sequence_inference.py} +11 -15
  70. ml_tools/ML_inference_vision/__init__.py +10 -0
  71. ml_tools/ML_inference_vision/_imprimir.py +8 -0
  72. ml_tools/{_core/_ML_vision_inference.py → ML_inference_vision/_vision_inference.py} +15 -19
  73. ml_tools/ML_models/__init__.py +32 -0
  74. ml_tools/{_core/_ML_models_advanced.py → ML_models/_advanced_models.py} +22 -18
  75. ml_tools/ML_models/_base_mlp_attention.py +198 -0
  76. ml_tools/{_core/_models_advanced_base.py → ML_models/_base_save_load.py} +73 -49
  77. ml_tools/ML_models/_dragon_tabular.py +248 -0
  78. ml_tools/ML_models/_imprimir.py +18 -0
  79. ml_tools/ML_models/_mlp_attention.py +134 -0
  80. ml_tools/{_core → ML_models}/_models_advanced_helpers.py +13 -13
  81. ml_tools/ML_models_sequence/__init__.py +10 -0
  82. ml_tools/ML_models_sequence/_imprimir.py +8 -0
  83. ml_tools/{_core/_ML_sequence_models.py → ML_models_sequence/_sequence_models.py} +5 -8
  84. ml_tools/ML_models_vision/__init__.py +29 -0
  85. ml_tools/ML_models_vision/_base_wrapper.py +254 -0
  86. ml_tools/ML_models_vision/_image_classification.py +182 -0
  87. ml_tools/ML_models_vision/_image_segmentation.py +108 -0
  88. ml_tools/ML_models_vision/_imprimir.py +16 -0
  89. ml_tools/ML_models_vision/_object_detection.py +135 -0
  90. ml_tools/ML_optimization/__init__.py +21 -0
  91. ml_tools/ML_optimization/_imprimir.py +13 -0
  92. ml_tools/{_core/_ML_optimization_pareto.py → ML_optimization/_multi_dragon.py} +18 -24
  93. ml_tools/ML_optimization/_single_dragon.py +203 -0
  94. ml_tools/{_core/_ML_optimization.py → ML_optimization/_single_manual.py} +75 -213
  95. ml_tools/{_core → ML_scaler}/_ML_scaler.py +8 -11
  96. ml_tools/ML_scaler/__init__.py +10 -0
  97. ml_tools/ML_scaler/_imprimir.py +8 -0
  98. ml_tools/ML_trainer/__init__.py +20 -0
  99. ml_tools/ML_trainer/_base_trainer.py +297 -0
  100. ml_tools/ML_trainer/_dragon_detection_trainer.py +402 -0
  101. ml_tools/ML_trainer/_dragon_sequence_trainer.py +540 -0
  102. ml_tools/ML_trainer/_dragon_trainer.py +1160 -0
  103. ml_tools/ML_trainer/_imprimir.py +10 -0
  104. ml_tools/{ML_utilities.py → ML_utilities/__init__.py} +14 -6
  105. ml_tools/ML_utilities/_artifact_finder.py +382 -0
  106. ml_tools/ML_utilities/_imprimir.py +16 -0
  107. ml_tools/ML_utilities/_inspection.py +325 -0
  108. ml_tools/ML_utilities/_train_tools.py +205 -0
  109. ml_tools/{ML_vision_transformers.py → ML_vision_transformers/__init__.py} +9 -6
  110. ml_tools/{_core/_ML_vision_transformers.py → ML_vision_transformers/_core_transforms.py} +11 -155
  111. ml_tools/ML_vision_transformers/_imprimir.py +14 -0
  112. ml_tools/ML_vision_transformers/_offline_augmentation.py +159 -0
  113. ml_tools/{_core/_PSO_optimization.py → PSO_optimization/_PSO.py} +58 -15
  114. ml_tools/{PSO_optimization.py → PSO_optimization/__init__.py} +5 -3
  115. ml_tools/PSO_optimization/_imprimir.py +10 -0
  116. ml_tools/SQL/__init__.py +7 -0
  117. ml_tools/{_core/_SQL.py → SQL/_dragon_SQL.py} +7 -11
  118. ml_tools/SQL/_imprimir.py +8 -0
  119. ml_tools/{_core → VIF}/_VIF_factor.py +5 -8
  120. ml_tools/{VIF_factor.py → VIF/__init__.py} +4 -2
  121. ml_tools/VIF/_imprimir.py +10 -0
  122. ml_tools/_core/__init__.py +7 -1
  123. ml_tools/_core/_logger.py +8 -18
  124. ml_tools/_core/_schema_load_ops.py +43 -0
  125. ml_tools/_core/_script_info.py +2 -2
  126. ml_tools/{data_exploration.py → data_exploration/__init__.py} +32 -16
  127. ml_tools/data_exploration/_analysis.py +214 -0
  128. ml_tools/data_exploration/_cleaning.py +566 -0
  129. ml_tools/data_exploration/_features.py +583 -0
  130. ml_tools/data_exploration/_imprimir.py +32 -0
  131. ml_tools/data_exploration/_plotting.py +487 -0
  132. ml_tools/data_exploration/_schema_ops.py +176 -0
  133. ml_tools/{ensemble_evaluation.py → ensemble_evaluation/__init__.py} +6 -4
  134. ml_tools/{_core → ensemble_evaluation}/_ensemble_evaluation.py +3 -7
  135. ml_tools/ensemble_evaluation/_imprimir.py +14 -0
  136. ml_tools/{ensemble_inference.py → ensemble_inference/__init__.py} +5 -3
  137. ml_tools/{_core → ensemble_inference}/_ensemble_inference.py +15 -18
  138. ml_tools/ensemble_inference/_imprimir.py +9 -0
  139. ml_tools/{ensemble_learning.py → ensemble_learning/__init__.py} +4 -6
  140. ml_tools/{_core → ensemble_learning}/_ensemble_learning.py +7 -10
  141. ml_tools/ensemble_learning/_imprimir.py +10 -0
  142. ml_tools/{excel_handler.py → excel_handler/__init__.py} +5 -3
  143. ml_tools/{_core → excel_handler}/_excel_handler.py +6 -10
  144. ml_tools/excel_handler/_imprimir.py +13 -0
  145. ml_tools/{keys.py → keys/__init__.py} +4 -1
  146. ml_tools/keys/_imprimir.py +11 -0
  147. ml_tools/{_core → keys}/_keys.py +2 -0
  148. ml_tools/{math_utilities.py → math_utilities/__init__.py} +5 -2
  149. ml_tools/math_utilities/_imprimir.py +11 -0
  150. ml_tools/{_core → math_utilities}/_math_utilities.py +1 -5
  151. ml_tools/{optimization_tools.py → optimization_tools/__init__.py} +9 -4
  152. ml_tools/optimization_tools/_imprimir.py +13 -0
  153. ml_tools/optimization_tools/_optimization_bounds.py +236 -0
  154. ml_tools/optimization_tools/_optimization_plots.py +218 -0
  155. ml_tools/{path_manager.py → path_manager/__init__.py} +6 -3
  156. ml_tools/{_core/_path_manager.py → path_manager/_dragonmanager.py} +11 -347
  157. ml_tools/path_manager/_imprimir.py +15 -0
  158. ml_tools/path_manager/_path_tools.py +346 -0
  159. ml_tools/plot_fonts/__init__.py +8 -0
  160. ml_tools/plot_fonts/_imprimir.py +8 -0
  161. ml_tools/{_core → plot_fonts}/_plot_fonts.py +2 -5
  162. ml_tools/schema/__init__.py +15 -0
  163. ml_tools/schema/_feature_schema.py +223 -0
  164. ml_tools/schema/_gui_schema.py +191 -0
  165. ml_tools/schema/_imprimir.py +10 -0
  166. ml_tools/{serde.py → serde/__init__.py} +4 -2
  167. ml_tools/serde/_imprimir.py +10 -0
  168. ml_tools/{_core → serde}/_serde.py +3 -8
  169. ml_tools/{utilities.py → utilities/__init__.py} +11 -6
  170. ml_tools/utilities/_imprimir.py +18 -0
  171. ml_tools/{_core/_utilities.py → utilities/_utility_save_load.py} +13 -190
  172. ml_tools/utilities/_utility_tools.py +192 -0
  173. dragon_ml_toolbox-19.14.0.dist-info/RECORD +0 -111
  174. ml_tools/ML_chaining_inference.py +0 -8
  175. ml_tools/ML_configuration.py +0 -86
  176. ml_tools/ML_configuration_pytab.py +0 -14
  177. ml_tools/ML_datasetmaster.py +0 -10
  178. ml_tools/ML_evaluation.py +0 -16
  179. ml_tools/ML_evaluation_multi.py +0 -12
  180. ml_tools/ML_finalize_handler.py +0 -8
  181. ml_tools/ML_inference.py +0 -12
  182. ml_tools/ML_models.py +0 -14
  183. ml_tools/ML_models_advanced.py +0 -14
  184. ml_tools/ML_models_pytab.py +0 -14
  185. ml_tools/ML_optimization.py +0 -14
  186. ml_tools/ML_optimization_pareto.py +0 -8
  187. ml_tools/ML_scaler.py +0 -8
  188. ml_tools/ML_sequence_datasetmaster.py +0 -8
  189. ml_tools/ML_sequence_evaluation.py +0 -10
  190. ml_tools/ML_sequence_inference.py +0 -8
  191. ml_tools/ML_sequence_models.py +0 -8
  192. ml_tools/ML_trainer.py +0 -12
  193. ml_tools/ML_vision_datasetmaster.py +0 -12
  194. ml_tools/ML_vision_evaluation.py +0 -10
  195. ml_tools/ML_vision_inference.py +0 -8
  196. ml_tools/ML_vision_models.py +0 -18
  197. ml_tools/SQL.py +0 -8
  198. ml_tools/_core/_ETL_cleaning.py +0 -694
  199. ml_tools/_core/_IO_tools.py +0 -498
  200. ml_tools/_core/_ML_callbacks.py +0 -702
  201. ml_tools/_core/_ML_configuration.py +0 -1332
  202. ml_tools/_core/_ML_configuration_pytab.py +0 -102
  203. ml_tools/_core/_ML_evaluation.py +0 -867
  204. ml_tools/_core/_ML_evaluation_multi.py +0 -544
  205. ml_tools/_core/_ML_inference.py +0 -646
  206. ml_tools/_core/_ML_models.py +0 -668
  207. ml_tools/_core/_ML_models_pytab.py +0 -693
  208. ml_tools/_core/_ML_trainer.py +0 -2323
  209. ml_tools/_core/_ML_utilities.py +0 -886
  210. ml_tools/_core/_ML_vision_models.py +0 -644
  211. ml_tools/_core/_data_exploration.py +0 -1909
  212. ml_tools/_core/_optimization_tools.py +0 -493
  213. ml_tools/_core/_schema.py +0 -359
  214. ml_tools/plot_fonts.py +0 -8
  215. ml_tools/schema.py +0 -12
  216. {dragon_ml_toolbox-19.14.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/WHEEL +0 -0
  217. {dragon_ml_toolbox-19.14.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/licenses/LICENSE +0 -0
  218. {dragon_ml_toolbox-19.14.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
  219. {dragon_ml_toolbox-19.14.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/top_level.txt +0 -0
ml_tools/schema/_gui_schema.py
@@ -0,0 +1,191 @@
+ from typing import Union, Any
+ from pathlib import Path
+ import json
+
+ from ..path_manager import make_fullpath
+
+ from ..keys._keys import SchemaKeys
+ from .._core import get_logger
+
+ from ._feature_schema import FeatureSchema
+
+
+ _LOGGER = get_logger("GUISchema")
+
+
+ __all__ = [
+     "create_guischema_template",
+     "make_multibinary_groups",
+ ]
+
+
+ def create_guischema_template(
+     directory: Union[str, Path],
+     feature_schema: FeatureSchema,
+     targets: list[str],
+     continuous_ranges: dict[str, tuple[float, float]],
+     multibinary_groups: Union[dict[str, list[str]], None] = None,
+ ) -> None:
+     """
+     Generates a 'GUISchema.json' boilerplate file based on the Model FeatureSchema.
+
+     The generated JSON contains entries with empty "gui_name" fields for manual mapping.
+     Leave 'gui_name' empty to use auto-formatted Title Case.
+
+     Args:
+         directory (str | Path): Where to save the json file.
+         feature_schema (FeatureSchema): The source FeatureSchema object.
+         targets (list[str]): List of target names as used in the ML pipeline.
+         continuous_ranges (Dict[str, Tuple[float, float]]): Dict {model_name: (min, max)}.
+         multibinary_groups (Dict[str, list[str]] | None): Optional Dict {GUI_Group_Name: [model_col_1, model_col_2]}.
+             Used to group binary columns into a single multi-select list.
+     """
+     dir_path = make_fullpath(directory, make=True, enforce="directory")
+
+     schema = feature_schema
+     output_data: dict[str, Any] = {
+         SchemaKeys.TARGETS: [],
+         SchemaKeys.CONTINUOUS: [],
+         SchemaKeys.BINARY: [],
+         SchemaKeys.MULTIBINARY: {},  # Structure: GroupName: [{model: x, gui: ""}]
+         SchemaKeys.CATEGORICAL: []
+     }
+
+     # Track handled columns to prevent duplicates in binary/categorical
+     handled_cols = set()
+
+     # 1. Targets
+     for t in targets:
+         output_data[SchemaKeys.TARGETS].append({
+             SchemaKeys.MODEL_NAME: t,
+             SchemaKeys.GUI_NAME: ""  # User to fill
+         })
+
+     # 2. Continuous
+     # Validate ranges against schema
+     schema_cont_set = set(schema.continuous_feature_names)
+     for name, min_max in continuous_ranges.items():
+         if name in schema_cont_set:
+             output_data[SchemaKeys.CONTINUOUS].append({
+                 SchemaKeys.MODEL_NAME: name,
+                 SchemaKeys.GUI_NAME: "",
+                 SchemaKeys.MIN_VALUE: min_max[0],
+                 SchemaKeys.MAX_VALUE: min_max[1]
+             })
+             handled_cols.add(name)
+         else:
+             _LOGGER.warning(f"GUISchema: Provided range for '{name}', but it is not in FeatureSchema continuous list.")
+
+     # 3. Multi-Binary Groups
+     if multibinary_groups:
+         # Check for validity within the generic feature list
+         all_feats = set(schema.feature_names)
+
+         for group_name, cols in multibinary_groups.items():
+             # Validation: Groups cannot be empty
+             if not cols:
+                 # warn and skip
+                 _LOGGER.warning(f"GUISchema: Multi-binary group '{group_name}' is empty and will be skipped.")
+                 continue
+
+             group_options = []
+             for col in cols:
+                 # Validation: Columns must exist in schema
+                 if col not in all_feats:
+                     # warn and skip
+                     _LOGGER.warning(f"GUISchema: Multi-binary column '{col}' in group '{group_name}' not found in FeatureSchema. Skipping.")
+                     continue
+                 # else, add to group
+                 group_options.append({
+                     SchemaKeys.MODEL_NAME: col,
+                     SchemaKeys.GUI_NAME: ""
+                 })
+                 handled_cols.add(col)
+             output_data[SchemaKeys.MULTIBINARY][group_name] = group_options
+
+     # 4. Binary & Categorical (Derived from Schema Mappings)
+     if schema.categorical_mappings:
+         for name, mapping in schema.categorical_mappings.items():
+             if name in handled_cols:
+                 continue
+
+             # Heuristic: Cardinality 2 = Binary, >2 = Categorical
+             if len(mapping) == 2:
+                 output_data[SchemaKeys.BINARY].append({
+                     SchemaKeys.MODEL_NAME: name,
+                     SchemaKeys.GUI_NAME: ""  # User to fill
+                 })
+             else:
+                 # For categorical, we also allow renaming the specific options
+                 options_with_names = {k: "" for k in mapping.keys()}  # Default gui_option = model_option
+
+                 output_data[SchemaKeys.CATEGORICAL].append({
+                     SchemaKeys.MODEL_NAME: name,
+                     SchemaKeys.GUI_NAME: "",  # User to fill feature name
+                     SchemaKeys.MAPPING: mapping,  # Original mapping
+                     SchemaKeys.OPTIONAL_LABELS: options_with_names  # User can edit keys here
+                 })
+
+     save_path = dir_path / SchemaKeys.GUI_SCHEMA_FILENAME
+     try:
+         with open(save_path, 'w', encoding='utf-8') as f:
+             json.dump(output_data, f, indent=4)
+         _LOGGER.info(f"GUISchema template generated at: '{dir_path.name}/{SchemaKeys.GUI_SCHEMA_FILENAME}'")
+     except IOError as e:
+         _LOGGER.error(f"Failed to save GUISchema template: {e}")
+
+
+ def make_multibinary_groups(
+     feature_schema: FeatureSchema,
+     group_prefixes: list[str],
+     separator: str = "_"
+ ) -> dict[str, list[str]]:
+     """
+     Helper to automate creating the multibinary_groups dictionary for create_guischema_template.
+
+     Iterates through provided prefixes and groups categorical features that contain
+     the pattern '{prefix}{separator}'.
+
+     Args:
+         feature_schema: The loaded FeatureSchema containing categorical feature names.
+         group_prefixes: A list of group prefixes to search for.
+         separator: The separator used in Multibinary Encoding (default '_').
+
+     Returns:
+         Dict[str, list[str]]: A dictionary mapping group names to their found column names.
+     """
+     groups: dict[str, list[str]] = {}
+
+     # check that categorical features exist
+     if not feature_schema.categorical_feature_names:
+         _LOGGER.error("FeatureSchema has no categorical features defined.")
+         raise ValueError()
+
+     # validate separator
+     if not separator or not isinstance(separator, str):
+         _LOGGER.error(f"Invalid separator '{separator}' of type {type(separator)}.")
+         raise ValueError()
+
+     for prefix in group_prefixes:
+         if not prefix or not isinstance(prefix, str):
+             _LOGGER.error(f"Invalid prefix '{prefix}' of type {type(prefix)}.")
+             raise ValueError()
+
+         search_term = f"{prefix}{separator}"
+
+         # check if substring exists in the column name. must begin with prefix+separator
+         cols = [
+             name for name in feature_schema.categorical_feature_names
+             if name.startswith(search_term)
+         ]
+
+         if cols:
+             groups[prefix] = cols
+         else:
+             _LOGGER.warning(f"No columns found for group '{prefix}' using search term '{search_term}'")
+
+     # log resulting groups
+     _LOGGER.info(f"Multibinary groups created: {list(groups.keys())}")
+
+     return groups
+
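Taken together, the two new public functions cover the whole template workflow: make_multibinary_groups derives the group dictionary from column prefixes, and create_guischema_template writes the editable JSON. A minimal sketch under assumed inputs (the schema object, prefixes, targets, and ranges below are hypothetical, not part of the release):

    # Hypothetical usage sketch; 'schema' is a FeatureSchema built elsewhere.
    from ml_tools.schema import create_guischema_template, make_multibinary_groups

    # Collect one-hot columns such as 'color_red' and 'color_blue' under 'color'.
    groups = make_multibinary_groups(schema, group_prefixes=["color"], separator="_")

    # Writes 'GUISchema.json' with empty "gui_name" fields for manual mapping.
    create_guischema_template(
        directory="schemas/",
        feature_schema=schema,
        targets=["strength"],
        continuous_ranges={"temperature": (0.0, 100.0)},
        multibinary_groups=groups,
    )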
ml_tools/schema/_imprimir.py
@@ -0,0 +1,10 @@
+ from .._core import _imprimir_disponibles
+
+ _GRUPOS = [
+     "FeatureSchema",
+     "create_guischema_template",
+     "make_multibinary_groups",
+ ]
+
+ def info():
+     _imprimir_disponibles(_GRUPOS)
ml_tools/serde/__init__.py
@@ -1,10 +1,12 @@
- from ._core._serde import (
+ from ._serde import (
      serialize_object_filename,
      serialize_object,
      deserialize_object,
-     info
  )

+ from ._imprimir import info
+
+
  __all__ = [
      "serialize_object_filename",
      "serialize_object",
ml_tools/serde/_imprimir.py
@@ -0,0 +1,10 @@
+ from .._core import _imprimir_disponibles
+
+ _GRUPOS = [
+     "serialize_object_filename",
+     "serialize_object",
+     "deserialize_object",
+ ]
+
+ def info():
+     _imprimir_disponibles(_GRUPOS)
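This _imprimir.py file repeats across the new subpackages in 20.0.0: info() moves out of each module body into a dedicated file that prints the names listed in _GRUPOS, replacing the old per-module _script_info(__all__) pattern (removed below in _serde.py). The call site is unchanged; a sketch:

    # Assumed call site: info() is re-exported by each subpackage __init__.
    from ml_tools import serde
    serde.info()  # prints the _GRUPOS entries via _imprimir_disponibles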
ml_tools/serde/_serde.py
@@ -3,9 +3,8 @@ from joblib.externals.loky.process_executor import TerminatedWorkerError
  from typing import Any, Union, TypeVar, get_origin, Type, Optional
  from pathlib import Path

- from ._path_manager import make_fullpath, sanitize_filename
- from ._script_info import _script_info
- from ._logger import get_logger
+ from ..path_manager import make_fullpath, sanitize_filename
+ from .._core import get_logger


  _LOGGER = get_logger("SERDE")
@@ -95,7 +94,7 @@ def serialize_object(obj: Any, file_path: Path, verbose: bool = True, raise_on_e

  # Define a TypeVar to link the expected type to the return type of deserialization
  T = TypeVar('T')
-
+
  def deserialize_object(
      filepath: Union[str, Path],
      expected_type: Optional[Type[T]] = None,
@@ -146,7 +145,3 @@ def deserialize_object(
      _LOGGER.info(f"Loaded object '{obj}' from '{true_filepath}'.")

      return obj # type: ignore
-
-
- def info():
-     _script_info(__all__)
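Only the import paths change in _serde.py; the public serialization API is untouched. A round-trip sketch under the new top-level path (the file name and payload are illustrative; only the obj/file_path and filepath/expected_type parameters are taken from the signatures shown above):

    # Hypothetical round trip; assumes the 'artifacts/' directory exists.
    from pathlib import Path
    from ml_tools.serde import serialize_object, deserialize_object

    serialize_object(obj={"weights": [1.0, 2.0]}, file_path=Path("artifacts/state.joblib"))

    # expected_type links to the TypeVar T, narrowing the return type for checkers.
    restored = deserialize_object("artifacts/state.joblib", expected_type=dict)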
ml_tools/utilities/__init__.py
@@ -1,27 +1,32 @@
- from ._core._utilities import (
+ from ._utility_save_load import (
      load_dataframe,
      load_dataframe_greedy,
      load_dataframe_with_schema,
      yield_dataframes_from_dir,
-     merge_dataframes,
      save_dataframe_filename,
      save_dataframe,
-     save_dataframe_with_schema,
+     save_dataframe_with_schema
+ )
+
+ from ._utility_tools import (
+     merge_dataframes,
      distribute_dataset_by_target,
      train_dataset_orchestrator,
-     train_dataset_yielder,
-     info
+     train_dataset_yielder
  )

+ from ._imprimir import info
+
+
  __all__ = [
      "load_dataframe",
      "load_dataframe_greedy",
      "load_dataframe_with_schema",
      "yield_dataframes_from_dir",
-     "merge_dataframes",
      "save_dataframe_filename",
      "save_dataframe",
      "save_dataframe_with_schema",
+     "merge_dataframes",
      "distribute_dataset_by_target",
      "train_dataset_orchestrator",
      "train_dataset_yielder"
ml_tools/utilities/_imprimir.py
@@ -0,0 +1,18 @@
+ from .._core import _imprimir_disponibles
+
+ _GRUPOS = [
+     "load_dataframe",
+     "load_dataframe_greedy",
+     "load_dataframe_with_schema",
+     "yield_dataframes_from_dir",
+     "save_dataframe_filename",
+     "save_dataframe",
+     "save_dataframe_with_schema",
+     "merge_dataframes",
+     "distribute_dataset_by_target",
+     "train_dataset_orchestrator",
+     "train_dataset_yielder"
+ ]
+
+ def info():
+     _imprimir_disponibles(_GRUPOS)
ml_tools/utilities/_utility_save_load.py
@@ -1,16 +1,16 @@
- import numpy as np
  import pandas as pd
  import polars as pl
+ import numpy as np
  from pathlib import Path
- from typing import Literal, Union, Optional, Any, Iterator, Tuple, overload
+ from typing import Literal, Union, Optional, Any, overload
+
+ from ..schema import FeatureSchema

- from ._path_manager import sanitize_filename, make_fullpath, list_csv_paths
- from ._script_info import _script_info
- from ._logger import get_logger
- from ._schema import FeatureSchema
+ from ..path_manager import make_fullpath, list_csv_paths, sanitize_filename
+ from .._core import get_logger


- _LOGGER = get_logger("Utilities")
+ _LOGGER = get_logger("Save/Load Utilities")


  __all__ = [
@@ -18,16 +18,13 @@ __all__ = [
      "load_dataframe_greedy",
      "load_dataframe_with_schema",
      "yield_dataframes_from_dir",
-     "merge_dataframes",
      "save_dataframe_filename",
      "save_dataframe",
-     "save_dataframe_with_schema",
-     "distribute_dataset_by_target",
-     "train_dataset_orchestrator",
-     "train_dataset_yielder"
+     "save_dataframe_with_schema"
  ]


+
  # Overload 1: When kind='pandas'
  @overload
  def load_dataframe(
@@ -36,7 +33,7 @@ def load_dataframe(
      kind: Literal["pandas"] = "pandas",
      all_strings: bool = False,
      verbose: bool = True
- ) -> Tuple[pd.DataFrame, str]:
+ ) -> tuple[pd.DataFrame, str]:
      ... # for overload stubs

  # Overload 2: When kind='polars'
@@ -47,7 +44,7 @@
      kind: Literal["polars"] = "polars",
      all_strings: bool = False,
      verbose: bool = True
- ) -> Tuple[pl.DataFrame, str]:
+ ) -> tuple[pl.DataFrame, str]:
      ... # for overload stubs

  def load_dataframe(
@@ -56,7 +53,7 @@
      kind: Literal["pandas", "polars"] = "pandas",
      all_strings: bool = False,
      verbose: bool = True
- ) -> Union[Tuple[pd.DataFrame, str], Tuple[pl.DataFrame, str]]:
+ ) -> Union[tuple[pd.DataFrame, str], tuple[pl.DataFrame, str]]:
      """
      Load a CSV file into a DataFrame and extract its base name.

@@ -187,7 +184,7 @@
      df_path: Union[str, Path],
      schema: "FeatureSchema",
      all_strings: bool = False,
- ) -> Tuple[pd.DataFrame, str]:
+ ) -> tuple[pd.DataFrame, str]:
      """
      Loads a CSV file into a Pandas DataFrame, strictly validating its
      feature columns against a FeatureSchema.
@@ -271,65 +268,6 @@ def yield_dataframes_from_dir(datasets_dir: Union[str,Path], verbose: bool=True)
      yield df, df_name


- def merge_dataframes(
-     *dfs: pd.DataFrame,
-     reset_index: bool = False,
-     direction: Literal["horizontal", "vertical"] = "horizontal",
-     verbose: bool=True
- ) -> pd.DataFrame:
-     """
-     Merges multiple DataFrames either horizontally or vertically.
-
-     Parameters:
-         *dfs (pd.DataFrame): Variable number of DataFrames to merge.
-         reset_index (bool): Whether to reset index in the final merged DataFrame.
-         direction (["horizontal" | "vertical"]):
-             - "horizontal": Merge on index, adding columns.
-             - "vertical": Append rows; all DataFrames must have identical columns.
-
-     Returns:
-         pd.DataFrame: A single merged DataFrame.
-
-     Raises:
-         ValueError:
-             - If fewer than 2 DataFrames are provided.
-             - If indexes do not match for horizontal merge.
-             - If column names or order differ for vertical merge.
-     """
-     if len(dfs) < 2:
-         raise ValueError("❌ At least 2 DataFrames must be provided.")
-
-     if verbose:
-         for i, df in enumerate(dfs, start=1):
-             print(f"➡️ DataFrame {i} shape: {df.shape}")
-
-
-     if direction == "horizontal":
-         reference_index = dfs[0].index
-         for i, df in enumerate(dfs, start=1):
-             if not df.index.equals(reference_index):
-                 raise ValueError(f"❌ Indexes do not match: Dataset 1 and Dataset {i}.")
-         merged_df = pd.concat(dfs, axis=1)
-
-     elif direction == "vertical":
-         reference_columns = dfs[0].columns
-         for i, df in enumerate(dfs, start=1):
-             if not df.columns.equals(reference_columns):
-                 raise ValueError(f"❌ Column names/order do not match: Dataset 1 and Dataset {i}.")
-         merged_df = pd.concat(dfs, axis=0)
-
-     else:
-         _LOGGER.error(f"Invalid merge direction: {direction}")
-         raise ValueError()
-
-     if reset_index:
-         merged_df = merged_df.reset_index(drop=True)
-
-     if verbose:
-         _LOGGER.info(f"Merged DataFrame shape: {merged_df.shape}")
-
-     return merged_df
-

  def save_dataframe_filename(df: Union[pd.DataFrame, pl.DataFrame], save_dir: Union[str,Path], filename: str) -> None:
      """
@@ -448,118 +386,6 @@
      save_dataframe(df=df_to_save, full_path=full_path)


- def distribute_dataset_by_target(
-     df_or_path: Union[pd.DataFrame, str, Path],
-     target_columns: list[str],
-     verbose: bool = False
- ) -> Iterator[Tuple[str, pd.DataFrame]]:
-     """
-     Yields cleaned DataFrames for each target column, where rows with missing
-     target values are removed. The target column is placed at the end.
-
-     Parameters
-     ----------
-     df_or_path : [pd.DataFrame | str | Path]
-         Dataframe or path to Dataframe with all feature and target columns ready to split and train a model.
-     target_columns : List[str]
-         List of target column names to generate per-target DataFrames.
-     verbose: bool
-         Whether to print info for each yielded dataset.
-
-     Yields
-     ------
-     Tuple[str, pd.DataFrame]
-         * Target name.
-         * Pandas DataFrame.
-     """
-     # Validate path or dataframe
-     if isinstance(df_or_path, str) or isinstance(df_or_path, Path):
-         df_path = make_fullpath(df_or_path)
-         df, _ = load_dataframe(df_path)
-     else:
-         df = df_or_path
-
-     valid_targets = [col for col in df.columns if col in target_columns]
-     feature_columns = [col for col in df.columns if col not in valid_targets]
-
-     for target in valid_targets:
-         subset = df[feature_columns + [target]].dropna(subset=[target]) # type: ignore
-         if verbose:
-             print(f"Target: '{target}' - Dataframe shape: {subset.shape}")
-         yield target, subset
-
-
- def train_dataset_orchestrator(list_of_dirs: list[Union[str,Path]],
-                                target_columns: list[str],
-                                save_dir: Union[str,Path],
-                                safe_mode: bool=False):
-     """
-     Orchestrates the creation of single-target datasets from multiple directories each with a variable number of CSV datasets.
-
-     This function iterates through a list of directories, finds all CSV files,
-     and splits each dataframe based on the provided target columns. Each resulting
-     single-target dataframe is then saved to a specified directory.
-
-     Parameters
-     ----------
-     list_of_dirs : list[str | Path]
-         A list of directory paths where the source CSV files are located.
-     target_columns : list[str]
-         A list of column names to be used as targets for splitting the datasets.
-     save_dir : str | Path
-         The directory where the newly created single-target datasets will be saved.
-     safe_mode : bool
-         If True, prefixes the saved filename with the source directory name to prevent overwriting files with the same name from different sources.
-     """
-     all_dir_paths: list[Path] = list()
-     for dir in list_of_dirs:
-         dir_path = make_fullpath(dir)
-         if not dir_path.is_dir():
-             _LOGGER.error(f"'{dir}' is not a directory.")
-             raise IOError()
-         all_dir_paths.append(dir_path)
-
-     # main loop
-     total_saved = 0
-     for df_dir in all_dir_paths:
-         for df_name, df_path in list_csv_paths(df_dir).items():
-             try:
-                 for target_name, df in distribute_dataset_by_target(df_or_path=df_path, target_columns=target_columns, verbose=False):
-                     if safe_mode:
-                         filename = df_dir.name + '_' + target_name + '_' + df_name
-                     else:
-                         filename = target_name + '_' + df_name
-                     save_dataframe_filename(df=df, save_dir=save_dir, filename=filename)
-                     total_saved += 1
-             except Exception as e:
-                 _LOGGER.error(f"Failed to process file '{df_path}'. Reason: {e}")
-                 continue
-
-     _LOGGER.info(f"{total_saved} single-target datasets were created.")
-
-
- def train_dataset_yielder(
-     df: pd.DataFrame,
-     target_cols: list[str]
- ) -> Iterator[Tuple[pd.DataFrame, pd.Series, list[str], str]]:
-     """
-     Yields one tuple at a time:
-     (features_dataframe, target_series, feature_names, target_name)
-
-     Skips any target columns not found in the DataFrame.
-     """
-     # Determine which target columns actually exist in the DataFrame
-     valid_targets = [col for col in target_cols if col in df.columns]
-
-     # Features = all columns excluding valid target columns
-     df_features = df.drop(columns=valid_targets)
-     feature_names = df_features.columns.to_list()
-
-     for target_col in valid_targets:
-         df_target = df[target_col]
-         yield (df_features, df_target, feature_names, target_col)
-
-
  def _validate_and_reorder_schema(
      df: pd.DataFrame,
      schema: "FeatureSchema"
@@ -626,6 +452,3 @@

      return df_to_process # type: ignore

-
- def info():
-     _script_info(__all__)
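The three dataset helpers removed above (distribute_dataset_by_target, train_dataset_orchestrator, train_dataset_yielder) are moved, not dropped: they reappear in ml_tools/utilities/_utility_tools.py (file 172 in the list) and are still re-exported from ml_tools.utilities. A sketch of the yielder against a toy frame, based on the docstring above:

    # Toy example; each iteration yields (features_df, target_series, feature_names, target_name).
    import pandas as pd
    from ml_tools.utilities import train_dataset_yielder

    df = pd.DataFrame({"x1": [1, 2], "x2": [3, 4], "y1": [0, 1], "y2": [1, 0]})
    for features, target, feature_names, target_name in train_dataset_yielder(df, target_cols=["y1", "y2"]):
        print(target_name, feature_names, features.shape, target.shape)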