dragon-ml-toolbox 19.14.0__py3-none-any.whl → 20.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (219) hide show
  1. {dragon_ml_toolbox-19.14.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/METADATA +29 -46
  2. dragon_ml_toolbox-20.0.0.dist-info/RECORD +178 -0
  3. ml_tools/{ETL_cleaning.py → ETL_cleaning/__init__.py} +13 -5
  4. ml_tools/ETL_cleaning/_basic_clean.py +351 -0
  5. ml_tools/ETL_cleaning/_clean_tools.py +128 -0
  6. ml_tools/ETL_cleaning/_dragon_cleaner.py +245 -0
  7. ml_tools/ETL_cleaning/_imprimir.py +13 -0
  8. ml_tools/{ETL_engineering.py → ETL_engineering/__init__.py} +8 -4
  9. ml_tools/ETL_engineering/_dragon_engineering.py +261 -0
  10. ml_tools/ETL_engineering/_imprimir.py +24 -0
  11. ml_tools/{_core/_ETL_engineering.py → ETL_engineering/_transforms.py} +14 -267
  12. ml_tools/{_core → GUI_tools}/_GUI_tools.py +37 -40
  13. ml_tools/{GUI_tools.py → GUI_tools/__init__.py} +7 -5
  14. ml_tools/GUI_tools/_imprimir.py +12 -0
  15. ml_tools/IO_tools/_IO_loggers.py +235 -0
  16. ml_tools/IO_tools/_IO_save_load.py +151 -0
  17. ml_tools/IO_tools/_IO_utils.py +140 -0
  18. ml_tools/{IO_tools.py → IO_tools/__init__.py} +13 -5
  19. ml_tools/IO_tools/_imprimir.py +14 -0
  20. ml_tools/MICE/_MICE_imputation.py +132 -0
  21. ml_tools/{MICE_imputation.py → MICE/__init__.py} +6 -7
  22. ml_tools/{_core/_MICE_imputation.py → MICE/_dragon_mice.py} +243 -322
  23. ml_tools/MICE/_imprimir.py +11 -0
  24. ml_tools/{ML_callbacks.py → ML_callbacks/__init__.py} +12 -4
  25. ml_tools/ML_callbacks/_base.py +101 -0
  26. ml_tools/ML_callbacks/_checkpoint.py +232 -0
  27. ml_tools/ML_callbacks/_early_stop.py +208 -0
  28. ml_tools/ML_callbacks/_imprimir.py +12 -0
  29. ml_tools/ML_callbacks/_scheduler.py +197 -0
  30. ml_tools/{ML_chaining_utilities.py → ML_chain/__init__.py} +8 -3
  31. ml_tools/{_core/_ML_chaining_utilities.py → ML_chain/_chaining_tools.py} +5 -129
  32. ml_tools/ML_chain/_dragon_chain.py +140 -0
  33. ml_tools/ML_chain/_imprimir.py +11 -0
  34. ml_tools/ML_configuration/__init__.py +90 -0
  35. ml_tools/ML_configuration/_base_model_config.py +69 -0
  36. ml_tools/ML_configuration/_finalize.py +366 -0
  37. ml_tools/ML_configuration/_imprimir.py +47 -0
  38. ml_tools/ML_configuration/_metrics.py +593 -0
  39. ml_tools/ML_configuration/_models.py +206 -0
  40. ml_tools/ML_configuration/_training.py +124 -0
  41. ml_tools/ML_datasetmaster/__init__.py +28 -0
  42. ml_tools/ML_datasetmaster/_base_datasetmaster.py +337 -0
  43. ml_tools/{_core/_ML_datasetmaster.py → ML_datasetmaster/_datasetmaster.py} +9 -329
  44. ml_tools/ML_datasetmaster/_imprimir.py +15 -0
  45. ml_tools/{_core/_ML_sequence_datasetmaster.py → ML_datasetmaster/_sequence_datasetmaster.py} +13 -15
  46. ml_tools/{_core/_ML_vision_datasetmaster.py → ML_datasetmaster/_vision_datasetmaster.py} +63 -65
  47. ml_tools/ML_evaluation/__init__.py +53 -0
  48. ml_tools/ML_evaluation/_classification.py +629 -0
  49. ml_tools/ML_evaluation/_feature_importance.py +409 -0
  50. ml_tools/ML_evaluation/_imprimir.py +25 -0
  51. ml_tools/ML_evaluation/_loss.py +92 -0
  52. ml_tools/ML_evaluation/_regression.py +273 -0
  53. ml_tools/{_core/_ML_sequence_evaluation.py → ML_evaluation/_sequence.py} +8 -11
  54. ml_tools/{_core/_ML_vision_evaluation.py → ML_evaluation/_vision.py} +12 -17
  55. ml_tools/{_core → ML_evaluation_captum}/_ML_evaluation_captum.py +11 -38
  56. ml_tools/{ML_evaluation_captum.py → ML_evaluation_captum/__init__.py} +6 -4
  57. ml_tools/ML_evaluation_captum/_imprimir.py +10 -0
  58. ml_tools/{_core → ML_finalize_handler}/_ML_finalize_handler.py +3 -7
  59. ml_tools/ML_finalize_handler/__init__.py +10 -0
  60. ml_tools/ML_finalize_handler/_imprimir.py +8 -0
  61. ml_tools/ML_inference/__init__.py +22 -0
  62. ml_tools/ML_inference/_base_inference.py +166 -0
  63. ml_tools/{_core/_ML_chaining_inference.py → ML_inference/_chain_inference.py} +14 -17
  64. ml_tools/ML_inference/_dragon_inference.py +332 -0
  65. ml_tools/ML_inference/_imprimir.py +11 -0
  66. ml_tools/ML_inference/_multi_inference.py +180 -0
  67. ml_tools/ML_inference_sequence/__init__.py +10 -0
  68. ml_tools/ML_inference_sequence/_imprimir.py +8 -0
  69. ml_tools/{_core/_ML_sequence_inference.py → ML_inference_sequence/_sequence_inference.py} +11 -15
  70. ml_tools/ML_inference_vision/__init__.py +10 -0
  71. ml_tools/ML_inference_vision/_imprimir.py +8 -0
  72. ml_tools/{_core/_ML_vision_inference.py → ML_inference_vision/_vision_inference.py} +15 -19
  73. ml_tools/ML_models/__init__.py +32 -0
  74. ml_tools/{_core/_ML_models_advanced.py → ML_models/_advanced_models.py} +22 -18
  75. ml_tools/ML_models/_base_mlp_attention.py +198 -0
  76. ml_tools/{_core/_models_advanced_base.py → ML_models/_base_save_load.py} +73 -49
  77. ml_tools/ML_models/_dragon_tabular.py +248 -0
  78. ml_tools/ML_models/_imprimir.py +18 -0
  79. ml_tools/ML_models/_mlp_attention.py +134 -0
  80. ml_tools/{_core → ML_models}/_models_advanced_helpers.py +13 -13
  81. ml_tools/ML_models_sequence/__init__.py +10 -0
  82. ml_tools/ML_models_sequence/_imprimir.py +8 -0
  83. ml_tools/{_core/_ML_sequence_models.py → ML_models_sequence/_sequence_models.py} +5 -8
  84. ml_tools/ML_models_vision/__init__.py +29 -0
  85. ml_tools/ML_models_vision/_base_wrapper.py +254 -0
  86. ml_tools/ML_models_vision/_image_classification.py +182 -0
  87. ml_tools/ML_models_vision/_image_segmentation.py +108 -0
  88. ml_tools/ML_models_vision/_imprimir.py +16 -0
  89. ml_tools/ML_models_vision/_object_detection.py +135 -0
  90. ml_tools/ML_optimization/__init__.py +21 -0
  91. ml_tools/ML_optimization/_imprimir.py +13 -0
  92. ml_tools/{_core/_ML_optimization_pareto.py → ML_optimization/_multi_dragon.py} +18 -24
  93. ml_tools/ML_optimization/_single_dragon.py +203 -0
  94. ml_tools/{_core/_ML_optimization.py → ML_optimization/_single_manual.py} +75 -213
  95. ml_tools/{_core → ML_scaler}/_ML_scaler.py +8 -11
  96. ml_tools/ML_scaler/__init__.py +10 -0
  97. ml_tools/ML_scaler/_imprimir.py +8 -0
  98. ml_tools/ML_trainer/__init__.py +20 -0
  99. ml_tools/ML_trainer/_base_trainer.py +297 -0
  100. ml_tools/ML_trainer/_dragon_detection_trainer.py +402 -0
  101. ml_tools/ML_trainer/_dragon_sequence_trainer.py +540 -0
  102. ml_tools/ML_trainer/_dragon_trainer.py +1160 -0
  103. ml_tools/ML_trainer/_imprimir.py +10 -0
  104. ml_tools/{ML_utilities.py → ML_utilities/__init__.py} +14 -6
  105. ml_tools/ML_utilities/_artifact_finder.py +382 -0
  106. ml_tools/ML_utilities/_imprimir.py +16 -0
  107. ml_tools/ML_utilities/_inspection.py +325 -0
  108. ml_tools/ML_utilities/_train_tools.py +205 -0
  109. ml_tools/{ML_vision_transformers.py → ML_vision_transformers/__init__.py} +9 -6
  110. ml_tools/{_core/_ML_vision_transformers.py → ML_vision_transformers/_core_transforms.py} +11 -155
  111. ml_tools/ML_vision_transformers/_imprimir.py +14 -0
  112. ml_tools/ML_vision_transformers/_offline_augmentation.py +159 -0
  113. ml_tools/{_core/_PSO_optimization.py → PSO_optimization/_PSO.py} +58 -15
  114. ml_tools/{PSO_optimization.py → PSO_optimization/__init__.py} +5 -3
  115. ml_tools/PSO_optimization/_imprimir.py +10 -0
  116. ml_tools/SQL/__init__.py +7 -0
  117. ml_tools/{_core/_SQL.py → SQL/_dragon_SQL.py} +7 -11
  118. ml_tools/SQL/_imprimir.py +8 -0
  119. ml_tools/{_core → VIF}/_VIF_factor.py +5 -8
  120. ml_tools/{VIF_factor.py → VIF/__init__.py} +4 -2
  121. ml_tools/VIF/_imprimir.py +10 -0
  122. ml_tools/_core/__init__.py +7 -1
  123. ml_tools/_core/_logger.py +8 -18
  124. ml_tools/_core/_schema_load_ops.py +43 -0
  125. ml_tools/_core/_script_info.py +2 -2
  126. ml_tools/{data_exploration.py → data_exploration/__init__.py} +32 -16
  127. ml_tools/data_exploration/_analysis.py +214 -0
  128. ml_tools/data_exploration/_cleaning.py +566 -0
  129. ml_tools/data_exploration/_features.py +583 -0
  130. ml_tools/data_exploration/_imprimir.py +32 -0
  131. ml_tools/data_exploration/_plotting.py +487 -0
  132. ml_tools/data_exploration/_schema_ops.py +176 -0
  133. ml_tools/{ensemble_evaluation.py → ensemble_evaluation/__init__.py} +6 -4
  134. ml_tools/{_core → ensemble_evaluation}/_ensemble_evaluation.py +3 -7
  135. ml_tools/ensemble_evaluation/_imprimir.py +14 -0
  136. ml_tools/{ensemble_inference.py → ensemble_inference/__init__.py} +5 -3
  137. ml_tools/{_core → ensemble_inference}/_ensemble_inference.py +15 -18
  138. ml_tools/ensemble_inference/_imprimir.py +9 -0
  139. ml_tools/{ensemble_learning.py → ensemble_learning/__init__.py} +4 -6
  140. ml_tools/{_core → ensemble_learning}/_ensemble_learning.py +7 -10
  141. ml_tools/ensemble_learning/_imprimir.py +10 -0
  142. ml_tools/{excel_handler.py → excel_handler/__init__.py} +5 -3
  143. ml_tools/{_core → excel_handler}/_excel_handler.py +6 -10
  144. ml_tools/excel_handler/_imprimir.py +13 -0
  145. ml_tools/{keys.py → keys/__init__.py} +4 -1
  146. ml_tools/keys/_imprimir.py +11 -0
  147. ml_tools/{_core → keys}/_keys.py +2 -0
  148. ml_tools/{math_utilities.py → math_utilities/__init__.py} +5 -2
  149. ml_tools/math_utilities/_imprimir.py +11 -0
  150. ml_tools/{_core → math_utilities}/_math_utilities.py +1 -5
  151. ml_tools/{optimization_tools.py → optimization_tools/__init__.py} +9 -4
  152. ml_tools/optimization_tools/_imprimir.py +13 -0
  153. ml_tools/optimization_tools/_optimization_bounds.py +236 -0
  154. ml_tools/optimization_tools/_optimization_plots.py +218 -0
  155. ml_tools/{path_manager.py → path_manager/__init__.py} +6 -3
  156. ml_tools/{_core/_path_manager.py → path_manager/_dragonmanager.py} +11 -347
  157. ml_tools/path_manager/_imprimir.py +15 -0
  158. ml_tools/path_manager/_path_tools.py +346 -0
  159. ml_tools/plot_fonts/__init__.py +8 -0
  160. ml_tools/plot_fonts/_imprimir.py +8 -0
  161. ml_tools/{_core → plot_fonts}/_plot_fonts.py +2 -5
  162. ml_tools/schema/__init__.py +15 -0
  163. ml_tools/schema/_feature_schema.py +223 -0
  164. ml_tools/schema/_gui_schema.py +191 -0
  165. ml_tools/schema/_imprimir.py +10 -0
  166. ml_tools/{serde.py → serde/__init__.py} +4 -2
  167. ml_tools/serde/_imprimir.py +10 -0
  168. ml_tools/{_core → serde}/_serde.py +3 -8
  169. ml_tools/{utilities.py → utilities/__init__.py} +11 -6
  170. ml_tools/utilities/_imprimir.py +18 -0
  171. ml_tools/{_core/_utilities.py → utilities/_utility_save_load.py} +13 -190
  172. ml_tools/utilities/_utility_tools.py +192 -0
  173. dragon_ml_toolbox-19.14.0.dist-info/RECORD +0 -111
  174. ml_tools/ML_chaining_inference.py +0 -8
  175. ml_tools/ML_configuration.py +0 -86
  176. ml_tools/ML_configuration_pytab.py +0 -14
  177. ml_tools/ML_datasetmaster.py +0 -10
  178. ml_tools/ML_evaluation.py +0 -16
  179. ml_tools/ML_evaluation_multi.py +0 -12
  180. ml_tools/ML_finalize_handler.py +0 -8
  181. ml_tools/ML_inference.py +0 -12
  182. ml_tools/ML_models.py +0 -14
  183. ml_tools/ML_models_advanced.py +0 -14
  184. ml_tools/ML_models_pytab.py +0 -14
  185. ml_tools/ML_optimization.py +0 -14
  186. ml_tools/ML_optimization_pareto.py +0 -8
  187. ml_tools/ML_scaler.py +0 -8
  188. ml_tools/ML_sequence_datasetmaster.py +0 -8
  189. ml_tools/ML_sequence_evaluation.py +0 -10
  190. ml_tools/ML_sequence_inference.py +0 -8
  191. ml_tools/ML_sequence_models.py +0 -8
  192. ml_tools/ML_trainer.py +0 -12
  193. ml_tools/ML_vision_datasetmaster.py +0 -12
  194. ml_tools/ML_vision_evaluation.py +0 -10
  195. ml_tools/ML_vision_inference.py +0 -8
  196. ml_tools/ML_vision_models.py +0 -18
  197. ml_tools/SQL.py +0 -8
  198. ml_tools/_core/_ETL_cleaning.py +0 -694
  199. ml_tools/_core/_IO_tools.py +0 -498
  200. ml_tools/_core/_ML_callbacks.py +0 -702
  201. ml_tools/_core/_ML_configuration.py +0 -1332
  202. ml_tools/_core/_ML_configuration_pytab.py +0 -102
  203. ml_tools/_core/_ML_evaluation.py +0 -867
  204. ml_tools/_core/_ML_evaluation_multi.py +0 -544
  205. ml_tools/_core/_ML_inference.py +0 -646
  206. ml_tools/_core/_ML_models.py +0 -668
  207. ml_tools/_core/_ML_models_pytab.py +0 -693
  208. ml_tools/_core/_ML_trainer.py +0 -2323
  209. ml_tools/_core/_ML_utilities.py +0 -886
  210. ml_tools/_core/_ML_vision_models.py +0 -644
  211. ml_tools/_core/_data_exploration.py +0 -1909
  212. ml_tools/_core/_optimization_tools.py +0 -493
  213. ml_tools/_core/_schema.py +0 -359
  214. ml_tools/plot_fonts.py +0 -8
  215. ml_tools/schema.py +0 -12
  216. {dragon_ml_toolbox-19.14.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/WHEEL +0 -0
  217. {dragon_ml_toolbox-19.14.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/licenses/LICENSE +0 -0
  218. {dragon_ml_toolbox-19.14.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
  219. {dragon_ml_toolbox-19.14.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,245 @@
1
+ import polars as pl
2
+ from pathlib import Path
3
+ from typing import Union
4
+
5
+ from ..utilities import save_dataframe_filename, load_dataframe
6
+
7
+ from .._core import get_logger
8
+ from ..path_manager import make_fullpath
9
+
10
+ from ._clean_tools import save_unique_values
11
+
12
+
13
+ _LOGGER = get_logger("DragonCleaner")
14
+
15
+
16
+ __all__ = [
17
+ "DragonColumnCleaner",
18
+ "DragonDataFrameCleaner",
19
+ ]
20
+
21
+
22
class DragonColumnCleaner:
    """
    Configuration object describing regex cleaning rules for one Polars column.

    Holds the target column name, a mapping of regex patterns to replacement
    strings (or None to nullify matches), and a case-sensitivity flag. Meant to
    be consumed by a `DragonDataFrameCleaner`.

    Notes:
        - Order rules from most specific to most general to build a fallback chain.
        - Watch out for chain replacements: a later rule can match text that was
          already rewritten by an earlier rule in the same cleaner.
    """
    def __init__(self,
                 column_name: str,
                 rules: Union[dict[str, Union[str, None]], dict[str, str]],
                 case_insensitive: bool = False):
        """
        Args:
            column_name (str):
                Name of the column to clean. Must be a non-empty string.
            rules (Dict[str, str | None]):
                Mapping of regex patterns to replacement strings.
                - A replacement of None marks matching values for conversion to null.
                - Backreferences to captured groups use Polars' '$' prefix (e.g. r'$1 $2').
            case_insensitive (bool):
                If True, regex matching ignores case.

        Raises:
            TypeError: If any argument has the wrong type.

        ## Usage Example

        ```python
        id_rules = {
            # Matches 'ID-12345' or 'ID 12345' and reformats to 'ID:12345'
            r'ID[- ](\\d+)': r'ID:$1'
        }

        id_cleaner = DragonColumnCleaner(column_name='user_id', rules=id_rules)
        # This object would then be passed to a DragonDataFrameCleaner.
        ```
        """
        if not (isinstance(column_name, str) and column_name):
            _LOGGER.error("The 'column_name' must be a non-empty string.")
            raise TypeError()
        if not isinstance(rules, dict):
            _LOGGER.error("The 'rules' argument must be a dictionary.")
            raise TypeError()

        # Every key must be a regex string; every value a string or None.
        for regex_pattern, substitute in rules.items():
            if not isinstance(regex_pattern, str):
                _LOGGER.error("All keys in 'rules' must be strings representing regex patterns.")
                raise TypeError()
            if substitute is not None and not isinstance(substitute, str):
                _LOGGER.error("All values in 'rules' must be strings or None (for nullification).")
                raise TypeError()

        self.column_name = column_name
        self.rules = rules
        self.case_insensitive = case_insensitive

    def preview(self,
                csv_path: Union[str, Path],
                report_dir: Union[str, Path],
                add_value_separator: bool = False,
                rule_batch_size: int = 150):
        """
        Writes a report of the unique values that the target column would hold
        after the current rules are applied, without modifying the source data.

        Args:
            csv_path (str | Path):
                Path to the CSV file containing the data to clean.
            report_dir (str | Path):
                Directory where the preview report will be saved.
            add_value_separator (bool):
                If True, adds a separator line between each unique value in the report.
            rule_batch_size (int):
                Splits the regex rules into chunks of this size. Helps prevent memory errors.
        """
        # Load only the relevant column, forcing every value to string dtype.
        frame, _ = load_dataframe(df_path=csv_path,
                                  use_columns=[self.column_name],
                                  kind="polars",
                                  all_strings=True)

        # Run this single cleaner through the orchestrator.
        runner = DragonDataFrameCleaner(cleaners=[self])
        cleaned_frame = runner.clean(frame, rule_batch_size=rule_batch_size)

        # Dump the resulting unique values for manual inspection.
        save_unique_values(csv_path_or_df=cleaned_frame,
                           output_dir=report_dir,
                           use_columns=[self.column_name],
                           verbose=False,
                           keep_column_order=False,
                           add_value_separator=add_value_separator)
113
class DragonDataFrameCleaner:
    """
    Orchestrates cleaning multiple columns of a Polars DataFrame.
    """
    def __init__(self, cleaners: list[DragonColumnCleaner]):
        """
        Accepts a list of `DragonColumnCleaner` objects whose rules will later be
        applied to the matching DataFrame columns using high-performance Polars
        expressions with memory optimization.

        Args:
            cleaners (List[DragonColumnCleaner]):
                One configuration object per column; duplicate targets are rejected.

        Raises:
            TypeError: If `cleaners` is not a list of DragonColumnCleaner objects.
            ValueError: If two cleaners target the same column.
        """
        if not isinstance(cleaners, list):
            _LOGGER.error("The 'cleaners' argument must be a list of DragonColumnCleaner objects.")
            raise TypeError()

        registered_columns = set()
        for candidate in cleaners:
            if not isinstance(candidate, DragonColumnCleaner):
                _LOGGER.error(f"All items in 'cleaners' list must be DragonColumnCleaner objects, but found an object of type {type(candidate).__name__}.")
                raise TypeError()
            if candidate.column_name in registered_columns:
                _LOGGER.error(f"Duplicate DragonColumnCleaner found for column '{candidate.column_name}'. Each column should only have one cleaner.")
                raise ValueError()
            registered_columns.add(candidate.column_name)

        self.cleaners = cleaners

    def clean(self, df: Union[pl.DataFrame, pl.LazyFrame],
              rule_batch_size: int = 150) -> pl.DataFrame:
        """
        Applies the configured cleaning rules. Execution is lazy internally,
        which helps handle OOM issues on large data.

        Args:
            df (pl.DataFrame | pl.LazyFrame):
                The data to clean.
            rule_batch_size (int):
                Splits the regex rules into chunks of this size. Helps prevent memory errors.

        Returns:
            pl.DataFrame: The cleaned, collected DataFrame.
        """
        # 1. Normalize the input to a LazyFrame.
        #    Eager frames get an upfront column check; for LazyFrames, missing
        #    columns surface only at collection time.
        if isinstance(df, pl.DataFrame):
            missing = {c.column_name for c in self.cleaners} - set(df.columns)
            if missing:
                _LOGGER.error(f"The following columns specified in cleaners are missing from the DataFrame: {missing}")
                raise ValueError()
            lazy_frame = df.lazy()
        elif isinstance(df, pl.LazyFrame):
            lazy_frame = df
        else:
            _LOGGER.error("The 'df' argument must be a Polars DataFrame or LazyFrame.")
            raise TypeError()

        # 2. Build the expression chain, one rule batch at a time, so no single
        #    expression grows too large.
        for cleaner in self.cleaners:
            target = cleaner.column_name
            rule_items = list(cleaner.rules.items())

            for start in range(0, len(rule_items), rule_batch_size):
                # Each batch starts from the (string-cast) column.
                expr = pl.col(target).cast(pl.String)

                for regex_pattern, substitute in rule_items[start:start + rule_batch_size]:
                    pattern = f"(?i){regex_pattern}" if cleaner.case_insensitive else regex_pattern

                    if substitute is None:
                        # Nullify any value matching the pattern.
                        expr = pl.when(expr.str.contains(pattern)) \
                            .then(None) \
                            .otherwise(expr)
                    else:
                        expr = expr.str.replace_all(pattern, substitute)

                lazy_frame = lazy_frame.with_columns(expr.alias(target))

        # 3. Collect the result with the streaming engine.
        try:
            return lazy_frame.collect(engine="streaming")
        except Exception as e:
            _LOGGER.error("An error occurred during the cleaning process.")
            raise e

    def load_clean_save(self,
                        input_filepath: Union[str, Path],
                        output_filepath: Union[str, Path],
                        rule_batch_size: int = 150):
        """
        Loads a DataFrame from a file, applies every cleaning rule configured in
        this `DragonDataFrameCleaner`, and saves the cleaned result to a new file.

        All data is loaded as string types so that type inference cannot alter
        values before the cleaning operations are applied.

        Args:
            input_filepath (Union[str, Path]):
                The path to the input data file.
            output_filepath (Union[str, Path]):
                The full path, where the cleaned data file will be saved.
            rule_batch_size (int):
                Splits the regex rules into chunks of this size. Helps prevent memory errors.
        """
        raw_df, _ = load_dataframe(df_path=input_filepath, kind="polars", all_strings=True)

        cleaned_df = self.clean(df=raw_df, rule_batch_size=rule_batch_size)

        # Normalize string paths; Path objects are used as-is.
        if isinstance(output_filepath, str):
            output_filepath = make_fullpath(input_path=output_filepath, enforce="file")

        save_dataframe_filename(df=cleaned_df, save_dir=output_filepath.parent, filename=output_filepath.name)

        return None
@@ -0,0 +1,13 @@
1
from .._core import _imprimir_disponibles

# Names exposed by the ETL_cleaning package, in the order they should be
# reported by info().
_GRUPOS = [
    "DragonColumnCleaner",
    "DragonDataFrameCleaner",
    "save_unique_values",
    "basic_clean",
    "basic_clean_drop",
    "drop_macro_polars",
]

def info():
    """Report the public names available in this package.

    Delegates to the shared `_imprimir_disponibles` helper — presumably a
    console printer; confirm in `_core`.
    """
    _imprimir_disponibles(_GRUPOS)
@@ -1,6 +1,9 @@
1
- from ._core._ETL_engineering import (
2
- DragonTransformRecipe,
1
+ from ._dragon_engineering import (
3
2
  DragonProcessor,
3
+ DragonTransformRecipe,
4
+ )
5
+
6
+ from ._transforms import (
4
7
  BinaryTransformer,
5
8
  MultiBinaryDummifier,
6
9
  AutoDummifier,
@@ -15,10 +18,11 @@ from ._core._ETL_engineering import (
15
18
  RegexMapper,
16
19
  ValueBinner,
17
20
  DateFeatureExtractor,
18
- MolecularFormulaTransformer,
19
- info
21
+ MolecularFormulaTransformer
20
22
  )
21
23
 
24
+ from ._imprimir import info
25
+
22
26
 
23
27
  __all__ = [
24
28
  "DragonTransformRecipe",
@@ -0,0 +1,261 @@
1
+ import polars as pl
2
+ from pathlib import Path
3
+ from typing import Union, Optional, Any, Callable
4
+
5
+ from ..utilities import load_dataframe, save_dataframe_filename
6
+
7
+ from ..keys._keys import MagicWords
8
+ from ..path_manager import make_fullpath
9
+ from .._core import get_logger
10
+
11
+
12
+ _LOGGER = get_logger("DragonTransform")
13
+
14
+
15
+ __all__ = [
16
+ "DragonTransformRecipe",
17
+ "DragonProcessor",
18
+ ]
19
+
20
+
21
class DragonTransformRecipe:
    """
    Builder for a data transformation recipe.

    Each call to `add()` validates and records one transformation step; the
    populated recipe is then handed to a `DragonProcessor`.
    """
    def __init__(self):
        # Ordered steps, each a dict with keys "input_col", "output_col", "transform".
        self._steps: list[dict[str, Any]] = []

    def add(
        self,
        input_col_name: str,
        transform: Union[str, Callable],
        output_col_names: Optional[Union[str, list[str]]] = None
    ) -> "DragonTransformRecipe":
        """
        Adds a new transformation step to the recipe.

        Args:
            input_col_name: The name of the column from the source DataFrame.
            transform: The transformation to apply:
                - Use "rename" for simple column renaming.
                - If callable, must accept a `pl.Series` as the only parameter
                  and return either a `pl.Series` or `pl.DataFrame`.
            output_col_names: The desired name(s) for the output column(s).
                - A string for a 1-to-1 mapping.
                - A list of strings for a 1-to-many mapping.
                - A string prefix for a 1-to-many mapping.
                - If None, the input name is used for 1-to-1 transforms,
                  or the transformer's default names are used for 1-to-many.

        Returns:
            This recipe instance, enabling method chaining.

        Raises:
            TypeError: On invalid argument types.
        """
        # --- Validation ---
        if not (isinstance(input_col_name, str) and input_col_name):
            _LOGGER.error("'input_col' must be a non-empty string.")
            raise TypeError()

        if transform == MagicWords.RENAME:
            if not isinstance(output_col_names, str):
                _LOGGER.error("For a RENAME operation, 'output_col' must be a string.")
                raise TypeError()
        elif not isinstance(transform, Callable):
            _LOGGER.error(f"'transform' must be a callable function or the string '{MagicWords.RENAME}'.")
            raise TypeError()

        # --- Record the step ---
        self._steps.append({
            "input_col": input_col_name,
            "output_col": output_col_names,
            "transform": transform,
        })
        return self  # supports recipe.add(...).add(...)

    def __iter__(self):
        """Iterate over the recorded steps, like a list."""
        return iter(self._steps)

    def __len__(self):
        """Number of recorded steps."""
        return len(self._steps)
class DragonProcessor:
    """
    Transforms a Polars DataFrame based on a provided `DragonTransformRecipe` object.

    Use the methods `transform()` or `load_transform_save()`.
    """
    def __init__(self, recipe: DragonTransformRecipe):
        """
        Initializes the DragonProcessor with a transformation recipe.

        Args:
            recipe: An instance of the `DragonTransformRecipe` class that has
                been populated with transformation steps.

        Raises:
            TypeError: If `recipe` is not a DragonTransformRecipe.
            ValueError: If the recipe contains no steps.
        """
        if not isinstance(recipe, DragonTransformRecipe):
            _LOGGER.error("The recipe must be an instance of DragonTransformRecipe.")
            raise TypeError()
        if len(recipe) == 0:
            _LOGGER.error("The recipe cannot be empty.")
            raise ValueError()
        self._recipe = recipe

    def transform(self, df: pl.DataFrame) -> pl.DataFrame:
        """
        Applies the transformation recipe to the input DataFrame.

        Each step either renames its input column or calls the step's function
        on the input Series; Series results map to a single output column,
        DataFrame results map to several (renamed per the step's output spec).

        Raises:
            ValueError: If an input column is missing, or a list output spec does
                not match the number of produced columns.
            TypeError: If an output spec or a transform result has the wrong type.
        """
        processed_columns = []
        # Recipe object is iterable
        for step in self._recipe:
            input_col_name = step["input_col"]
            output_col_spec = step["output_col"]
            transform_action = step["transform"]

            if input_col_name not in df.columns:
                _LOGGER.error(f"Input column '{input_col_name}' not found in DataFrame.")
                raise ValueError()

            input_series = df.get_column(input_col_name)

            # Simple rename: no function call involved.
            if transform_action == MagicWords.RENAME:
                processed_columns.append(input_series.alias(output_col_spec))
                continue

            if isinstance(transform_action, Callable):
                result = transform_action(input_series)

                if isinstance(result, pl.Series):
                    # 1-to-1 mapping: default to the input name if spec is None.
                    output_name = output_col_spec if output_col_spec is not None else input_col_name

                    if not isinstance(output_name, str):
                        _LOGGER.error(f"Function for '{input_col_name}' returned a Series but 'output_col' must be a string or None.")
                        raise TypeError()
                    processed_columns.append(result.alias(output_name))

                elif isinstance(result, pl.DataFrame):
                    # 1-to-many mapping; resolution depends on the spec's type.
                    if output_col_spec is None:
                        # Use the column names generated by the transformer directly.
                        processed_columns.extend(result.get_columns())

                    elif isinstance(output_col_spec, list):
                        # Explicit list of names: must match produced column count.
                        if len(result.columns) != len(output_col_spec):
                            _LOGGER.error(f"Mismatch in '{input_col_name}': function produced {len(result.columns)} columns, but recipe specifies {len(output_col_spec)} output names.")
                            raise ValueError()

                        renamed_df = result.rename(dict(zip(result.columns, output_col_spec)))
                        processed_columns.extend(renamed_df.get_columns())

                    elif isinstance(output_col_spec, str):
                        # Single string acts as a prefix for every produced column.
                        prefix = output_col_spec
                        new_names = {}

                        for col in result.columns:
                            # Case 1: the produced name starts with the input name.
                            # Replace that leading input name with the prefix.
                            # Example: input='color', output='color_red', prefix='spec' -> 'spec_red'
                            if col.startswith(input_col_name):
                                new_names[col] = col.replace(input_col_name, prefix, 1)

                            # Case 2: the produced name is independent of the input.
                            # Prepend the prefix.
                            # Example: input='ratio', output='A_B', prefix='spec' -> 'spec_A_B'
                            else:
                                new_names[col] = f"{prefix}_{col}"

                        renamed_df = result.rename(new_names)
                        processed_columns.extend(renamed_df.get_columns())

                    else:
                        _LOGGER.error(f"Function for '{input_col_name}' returned a DataFrame, so 'output_col' must be a list of names, a string prefix, or None.")
                        raise TypeError()

                else:
                    _LOGGER.error(f"Function for '{input_col_name}' returned an unexpected type: {type(result)}.")
                    raise TypeError()

            else:  # This case is unlikely due to builder validation.
                _LOGGER.error(f"Invalid 'transform' action for '{input_col_name}': {transform_action}")
                raise TypeError()

        if not processed_columns:
            _LOGGER.error("The transformation resulted in an empty DataFrame.")
            return pl.DataFrame()

        _LOGGER.info(f"Processed dataframe with {len(processed_columns)} columns.")

        return pl.DataFrame(processed_columns)

    def load_transform_save(self, input_path: Union[str, Path], output_path: Union[str, Path]):
        """
        Convenience wrapper for the transform method that includes automatic dataframe loading and saving.
        """
        # Validate paths
        in_path = make_fullpath(input_path, enforce="file")
        out_path = make_fullpath(output_path, make=True, enforce="file")

        # Load everything as strings to avoid premature type inference.
        df, _ = load_dataframe(df_path=in_path, kind="polars", all_strings=True)

        # Process
        df_processed = self.transform(df)

        # Save processed df
        save_dataframe_filename(df=df_processed, save_dir=out_path.parent, filename=out_path.name)

    def __str__(self) -> str:
        """
        Provides a detailed, human-readable string representation of the
        entire processing pipeline.
        """
        header = "DragonProcessor Pipeline"
        divider = "-" * len(header)
        num_steps = len(self._recipe)

        lines = [
            header,
            divider,
            f"Number of steps: {num_steps}\n"
        ]

        if num_steps == 0:
            lines.append("No transformation steps defined.")
            return "\n".join(lines)

        for i, step in enumerate(self._recipe, 1):
            transform_action = step["transform"]

            # Get a clean name for the transformation action
            if transform_action == MagicWords.RENAME:  # "rename"
                transform_name = "Rename"
            else:
                # FIX: `type(x).__name__` labeled every plain function as
                # "function". Prefer the callable's own __name__ (functions,
                # lambdas) and fall back to the type name for callable class
                # instances, which normally carry no __name__ of their own.
                transform_name = getattr(transform_action, "__name__", type(transform_action).__name__)

            lines.append(f"[{i}] Input: '{step['input_col']}'")
            lines.append(f"  - Transform: {transform_name}")
            lines.append(f"  - Output(s): {step['output_col']}")
            if i < num_steps:
                lines.append("")  # Add a blank line between steps

        return "\n".join(lines)

    def inspect(self) -> None:
        """
        Prints the detailed string representation of the pipeline to the console.
        """
        print(self)
@@ -0,0 +1,24 @@
1
from .._core import _imprimir_disponibles

# Names exposed by the ETL_engineering package, in the order they should be
# reported by info().
_GRUPOS = [
    "DragonTransformRecipe",
    "DragonProcessor",
    "BinaryTransformer",
    "MultiBinaryDummifier",
    "AutoDummifier",
    "KeywordDummifier",
    "NumberExtractor",
    "MultiNumberExtractor",
    "TemperatureExtractor",
    "MultiTemperatureExtractor",
    "RatioCalculator",
    "TriRatioCalculator",
    "CategoryMapper",
    "RegexMapper",
    "ValueBinner",
    "DateFeatureExtractor",
    "MolecularFormulaTransformer"
]

def info():
    """Report the public names available in this package.

    Delegates to the shared `_imprimir_disponibles` helper — presumably a
    console printer; confirm in `_core`.
    """
    _imprimir_disponibles(_GRUPOS)