dragon-ml-toolbox 19.14.0__py3-none-any.whl → 20.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (219)
  1. {dragon_ml_toolbox-19.14.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/METADATA +29 -46
  2. dragon_ml_toolbox-20.0.0.dist-info/RECORD +178 -0
  3. ml_tools/{ETL_cleaning.py → ETL_cleaning/__init__.py} +13 -5
  4. ml_tools/ETL_cleaning/_basic_clean.py +351 -0
  5. ml_tools/ETL_cleaning/_clean_tools.py +128 -0
  6. ml_tools/ETL_cleaning/_dragon_cleaner.py +245 -0
  7. ml_tools/ETL_cleaning/_imprimir.py +13 -0
  8. ml_tools/{ETL_engineering.py → ETL_engineering/__init__.py} +8 -4
  9. ml_tools/ETL_engineering/_dragon_engineering.py +261 -0
  10. ml_tools/ETL_engineering/_imprimir.py +24 -0
  11. ml_tools/{_core/_ETL_engineering.py → ETL_engineering/_transforms.py} +14 -267
  12. ml_tools/{_core → GUI_tools}/_GUI_tools.py +37 -40
  13. ml_tools/{GUI_tools.py → GUI_tools/__init__.py} +7 -5
  14. ml_tools/GUI_tools/_imprimir.py +12 -0
  15. ml_tools/IO_tools/_IO_loggers.py +235 -0
  16. ml_tools/IO_tools/_IO_save_load.py +151 -0
  17. ml_tools/IO_tools/_IO_utils.py +140 -0
  18. ml_tools/{IO_tools.py → IO_tools/__init__.py} +13 -5
  19. ml_tools/IO_tools/_imprimir.py +14 -0
  20. ml_tools/MICE/_MICE_imputation.py +132 -0
  21. ml_tools/{MICE_imputation.py → MICE/__init__.py} +6 -7
  22. ml_tools/{_core/_MICE_imputation.py → MICE/_dragon_mice.py} +243 -322
  23. ml_tools/MICE/_imprimir.py +11 -0
  24. ml_tools/{ML_callbacks.py → ML_callbacks/__init__.py} +12 -4
  25. ml_tools/ML_callbacks/_base.py +101 -0
  26. ml_tools/ML_callbacks/_checkpoint.py +232 -0
  27. ml_tools/ML_callbacks/_early_stop.py +208 -0
  28. ml_tools/ML_callbacks/_imprimir.py +12 -0
  29. ml_tools/ML_callbacks/_scheduler.py +197 -0
  30. ml_tools/{ML_chaining_utilities.py → ML_chain/__init__.py} +8 -3
  31. ml_tools/{_core/_ML_chaining_utilities.py → ML_chain/_chaining_tools.py} +5 -129
  32. ml_tools/ML_chain/_dragon_chain.py +140 -0
  33. ml_tools/ML_chain/_imprimir.py +11 -0
  34. ml_tools/ML_configuration/__init__.py +90 -0
  35. ml_tools/ML_configuration/_base_model_config.py +69 -0
  36. ml_tools/ML_configuration/_finalize.py +366 -0
  37. ml_tools/ML_configuration/_imprimir.py +47 -0
  38. ml_tools/ML_configuration/_metrics.py +593 -0
  39. ml_tools/ML_configuration/_models.py +206 -0
  40. ml_tools/ML_configuration/_training.py +124 -0
  41. ml_tools/ML_datasetmaster/__init__.py +28 -0
  42. ml_tools/ML_datasetmaster/_base_datasetmaster.py +337 -0
  43. ml_tools/{_core/_ML_datasetmaster.py → ML_datasetmaster/_datasetmaster.py} +9 -329
  44. ml_tools/ML_datasetmaster/_imprimir.py +15 -0
  45. ml_tools/{_core/_ML_sequence_datasetmaster.py → ML_datasetmaster/_sequence_datasetmaster.py} +13 -15
  46. ml_tools/{_core/_ML_vision_datasetmaster.py → ML_datasetmaster/_vision_datasetmaster.py} +63 -65
  47. ml_tools/ML_evaluation/__init__.py +53 -0
  48. ml_tools/ML_evaluation/_classification.py +629 -0
  49. ml_tools/ML_evaluation/_feature_importance.py +409 -0
  50. ml_tools/ML_evaluation/_imprimir.py +25 -0
  51. ml_tools/ML_evaluation/_loss.py +92 -0
  52. ml_tools/ML_evaluation/_regression.py +273 -0
  53. ml_tools/{_core/_ML_sequence_evaluation.py → ML_evaluation/_sequence.py} +8 -11
  54. ml_tools/{_core/_ML_vision_evaluation.py → ML_evaluation/_vision.py} +12 -17
  55. ml_tools/{_core → ML_evaluation_captum}/_ML_evaluation_captum.py +11 -38
  56. ml_tools/{ML_evaluation_captum.py → ML_evaluation_captum/__init__.py} +6 -4
  57. ml_tools/ML_evaluation_captum/_imprimir.py +10 -0
  58. ml_tools/{_core → ML_finalize_handler}/_ML_finalize_handler.py +3 -7
  59. ml_tools/ML_finalize_handler/__init__.py +10 -0
  60. ml_tools/ML_finalize_handler/_imprimir.py +8 -0
  61. ml_tools/ML_inference/__init__.py +22 -0
  62. ml_tools/ML_inference/_base_inference.py +166 -0
  63. ml_tools/{_core/_ML_chaining_inference.py → ML_inference/_chain_inference.py} +14 -17
  64. ml_tools/ML_inference/_dragon_inference.py +332 -0
  65. ml_tools/ML_inference/_imprimir.py +11 -0
  66. ml_tools/ML_inference/_multi_inference.py +180 -0
  67. ml_tools/ML_inference_sequence/__init__.py +10 -0
  68. ml_tools/ML_inference_sequence/_imprimir.py +8 -0
  69. ml_tools/{_core/_ML_sequence_inference.py → ML_inference_sequence/_sequence_inference.py} +11 -15
  70. ml_tools/ML_inference_vision/__init__.py +10 -0
  71. ml_tools/ML_inference_vision/_imprimir.py +8 -0
  72. ml_tools/{_core/_ML_vision_inference.py → ML_inference_vision/_vision_inference.py} +15 -19
  73. ml_tools/ML_models/__init__.py +32 -0
  74. ml_tools/{_core/_ML_models_advanced.py → ML_models/_advanced_models.py} +22 -18
  75. ml_tools/ML_models/_base_mlp_attention.py +198 -0
  76. ml_tools/{_core/_models_advanced_base.py → ML_models/_base_save_load.py} +73 -49
  77. ml_tools/ML_models/_dragon_tabular.py +248 -0
  78. ml_tools/ML_models/_imprimir.py +18 -0
  79. ml_tools/ML_models/_mlp_attention.py +134 -0
  80. ml_tools/{_core → ML_models}/_models_advanced_helpers.py +13 -13
  81. ml_tools/ML_models_sequence/__init__.py +10 -0
  82. ml_tools/ML_models_sequence/_imprimir.py +8 -0
  83. ml_tools/{_core/_ML_sequence_models.py → ML_models_sequence/_sequence_models.py} +5 -8
  84. ml_tools/ML_models_vision/__init__.py +29 -0
  85. ml_tools/ML_models_vision/_base_wrapper.py +254 -0
  86. ml_tools/ML_models_vision/_image_classification.py +182 -0
  87. ml_tools/ML_models_vision/_image_segmentation.py +108 -0
  88. ml_tools/ML_models_vision/_imprimir.py +16 -0
  89. ml_tools/ML_models_vision/_object_detection.py +135 -0
  90. ml_tools/ML_optimization/__init__.py +21 -0
  91. ml_tools/ML_optimization/_imprimir.py +13 -0
  92. ml_tools/{_core/_ML_optimization_pareto.py → ML_optimization/_multi_dragon.py} +18 -24
  93. ml_tools/ML_optimization/_single_dragon.py +203 -0
  94. ml_tools/{_core/_ML_optimization.py → ML_optimization/_single_manual.py} +75 -213
  95. ml_tools/{_core → ML_scaler}/_ML_scaler.py +8 -11
  96. ml_tools/ML_scaler/__init__.py +10 -0
  97. ml_tools/ML_scaler/_imprimir.py +8 -0
  98. ml_tools/ML_trainer/__init__.py +20 -0
  99. ml_tools/ML_trainer/_base_trainer.py +297 -0
  100. ml_tools/ML_trainer/_dragon_detection_trainer.py +402 -0
  101. ml_tools/ML_trainer/_dragon_sequence_trainer.py +540 -0
  102. ml_tools/ML_trainer/_dragon_trainer.py +1160 -0
  103. ml_tools/ML_trainer/_imprimir.py +10 -0
  104. ml_tools/{ML_utilities.py → ML_utilities/__init__.py} +14 -6
  105. ml_tools/ML_utilities/_artifact_finder.py +382 -0
  106. ml_tools/ML_utilities/_imprimir.py +16 -0
  107. ml_tools/ML_utilities/_inspection.py +325 -0
  108. ml_tools/ML_utilities/_train_tools.py +205 -0
  109. ml_tools/{ML_vision_transformers.py → ML_vision_transformers/__init__.py} +9 -6
  110. ml_tools/{_core/_ML_vision_transformers.py → ML_vision_transformers/_core_transforms.py} +11 -155
  111. ml_tools/ML_vision_transformers/_imprimir.py +14 -0
  112. ml_tools/ML_vision_transformers/_offline_augmentation.py +159 -0
  113. ml_tools/{_core/_PSO_optimization.py → PSO_optimization/_PSO.py} +58 -15
  114. ml_tools/{PSO_optimization.py → PSO_optimization/__init__.py} +5 -3
  115. ml_tools/PSO_optimization/_imprimir.py +10 -0
  116. ml_tools/SQL/__init__.py +7 -0
  117. ml_tools/{_core/_SQL.py → SQL/_dragon_SQL.py} +7 -11
  118. ml_tools/SQL/_imprimir.py +8 -0
  119. ml_tools/{_core → VIF}/_VIF_factor.py +5 -8
  120. ml_tools/{VIF_factor.py → VIF/__init__.py} +4 -2
  121. ml_tools/VIF/_imprimir.py +10 -0
  122. ml_tools/_core/__init__.py +7 -1
  123. ml_tools/_core/_logger.py +8 -18
  124. ml_tools/_core/_schema_load_ops.py +43 -0
  125. ml_tools/_core/_script_info.py +2 -2
  126. ml_tools/{data_exploration.py → data_exploration/__init__.py} +32 -16
  127. ml_tools/data_exploration/_analysis.py +214 -0
  128. ml_tools/data_exploration/_cleaning.py +566 -0
  129. ml_tools/data_exploration/_features.py +583 -0
  130. ml_tools/data_exploration/_imprimir.py +32 -0
  131. ml_tools/data_exploration/_plotting.py +487 -0
  132. ml_tools/data_exploration/_schema_ops.py +176 -0
  133. ml_tools/{ensemble_evaluation.py → ensemble_evaluation/__init__.py} +6 -4
  134. ml_tools/{_core → ensemble_evaluation}/_ensemble_evaluation.py +3 -7
  135. ml_tools/ensemble_evaluation/_imprimir.py +14 -0
  136. ml_tools/{ensemble_inference.py → ensemble_inference/__init__.py} +5 -3
  137. ml_tools/{_core → ensemble_inference}/_ensemble_inference.py +15 -18
  138. ml_tools/ensemble_inference/_imprimir.py +9 -0
  139. ml_tools/{ensemble_learning.py → ensemble_learning/__init__.py} +4 -6
  140. ml_tools/{_core → ensemble_learning}/_ensemble_learning.py +7 -10
  141. ml_tools/ensemble_learning/_imprimir.py +10 -0
  142. ml_tools/{excel_handler.py → excel_handler/__init__.py} +5 -3
  143. ml_tools/{_core → excel_handler}/_excel_handler.py +6 -10
  144. ml_tools/excel_handler/_imprimir.py +13 -0
  145. ml_tools/{keys.py → keys/__init__.py} +4 -1
  146. ml_tools/keys/_imprimir.py +11 -0
  147. ml_tools/{_core → keys}/_keys.py +2 -0
  148. ml_tools/{math_utilities.py → math_utilities/__init__.py} +5 -2
  149. ml_tools/math_utilities/_imprimir.py +11 -0
  150. ml_tools/{_core → math_utilities}/_math_utilities.py +1 -5
  151. ml_tools/{optimization_tools.py → optimization_tools/__init__.py} +9 -4
  152. ml_tools/optimization_tools/_imprimir.py +13 -0
  153. ml_tools/optimization_tools/_optimization_bounds.py +236 -0
  154. ml_tools/optimization_tools/_optimization_plots.py +218 -0
  155. ml_tools/{path_manager.py → path_manager/__init__.py} +6 -3
  156. ml_tools/{_core/_path_manager.py → path_manager/_dragonmanager.py} +11 -347
  157. ml_tools/path_manager/_imprimir.py +15 -0
  158. ml_tools/path_manager/_path_tools.py +346 -0
  159. ml_tools/plot_fonts/__init__.py +8 -0
  160. ml_tools/plot_fonts/_imprimir.py +8 -0
  161. ml_tools/{_core → plot_fonts}/_plot_fonts.py +2 -5
  162. ml_tools/schema/__init__.py +15 -0
  163. ml_tools/schema/_feature_schema.py +223 -0
  164. ml_tools/schema/_gui_schema.py +191 -0
  165. ml_tools/schema/_imprimir.py +10 -0
  166. ml_tools/{serde.py → serde/__init__.py} +4 -2
  167. ml_tools/serde/_imprimir.py +10 -0
  168. ml_tools/{_core → serde}/_serde.py +3 -8
  169. ml_tools/{utilities.py → utilities/__init__.py} +11 -6
  170. ml_tools/utilities/_imprimir.py +18 -0
  171. ml_tools/{_core/_utilities.py → utilities/_utility_save_load.py} +13 -190
  172. ml_tools/utilities/_utility_tools.py +192 -0
  173. dragon_ml_toolbox-19.14.0.dist-info/RECORD +0 -111
  174. ml_tools/ML_chaining_inference.py +0 -8
  175. ml_tools/ML_configuration.py +0 -86
  176. ml_tools/ML_configuration_pytab.py +0 -14
  177. ml_tools/ML_datasetmaster.py +0 -10
  178. ml_tools/ML_evaluation.py +0 -16
  179. ml_tools/ML_evaluation_multi.py +0 -12
  180. ml_tools/ML_finalize_handler.py +0 -8
  181. ml_tools/ML_inference.py +0 -12
  182. ml_tools/ML_models.py +0 -14
  183. ml_tools/ML_models_advanced.py +0 -14
  184. ml_tools/ML_models_pytab.py +0 -14
  185. ml_tools/ML_optimization.py +0 -14
  186. ml_tools/ML_optimization_pareto.py +0 -8
  187. ml_tools/ML_scaler.py +0 -8
  188. ml_tools/ML_sequence_datasetmaster.py +0 -8
  189. ml_tools/ML_sequence_evaluation.py +0 -10
  190. ml_tools/ML_sequence_inference.py +0 -8
  191. ml_tools/ML_sequence_models.py +0 -8
  192. ml_tools/ML_trainer.py +0 -12
  193. ml_tools/ML_vision_datasetmaster.py +0 -12
  194. ml_tools/ML_vision_evaluation.py +0 -10
  195. ml_tools/ML_vision_inference.py +0 -8
  196. ml_tools/ML_vision_models.py +0 -18
  197. ml_tools/SQL.py +0 -8
  198. ml_tools/_core/_ETL_cleaning.py +0 -694
  199. ml_tools/_core/_IO_tools.py +0 -498
  200. ml_tools/_core/_ML_callbacks.py +0 -702
  201. ml_tools/_core/_ML_configuration.py +0 -1332
  202. ml_tools/_core/_ML_configuration_pytab.py +0 -102
  203. ml_tools/_core/_ML_evaluation.py +0 -867
  204. ml_tools/_core/_ML_evaluation_multi.py +0 -544
  205. ml_tools/_core/_ML_inference.py +0 -646
  206. ml_tools/_core/_ML_models.py +0 -668
  207. ml_tools/_core/_ML_models_pytab.py +0 -693
  208. ml_tools/_core/_ML_trainer.py +0 -2323
  209. ml_tools/_core/_ML_utilities.py +0 -886
  210. ml_tools/_core/_ML_vision_models.py +0 -644
  211. ml_tools/_core/_data_exploration.py +0 -1909
  212. ml_tools/_core/_optimization_tools.py +0 -493
  213. ml_tools/_core/_schema.py +0 -359
  214. ml_tools/plot_fonts.py +0 -8
  215. ml_tools/schema.py +0 -12
  216. {dragon_ml_toolbox-19.14.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/WHEEL +0 -0
  217. {dragon_ml_toolbox-19.14.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/licenses/LICENSE +0 -0
  218. {dragon_ml_toolbox-19.14.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
  219. {dragon_ml_toolbox-19.14.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/top_level.txt +0 -0
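The dominant change in 20.0.0 is structural: flat modules such as ml_tools/ETL_cleaning.py become packages (ml_tools/ETL_cleaning/__init__.py plus private _*.py submodules), and the former ml_tools/_core/_*.py internals move into those packages. Assuming each new __init__.py re-exports its public API — which the rename entries above and the __all__ lists in the two diffs below suggest, though only two files are shown here — the pre-20.0.0 import style should keep resolving. A minimal sketch of that check:

import ml_tools.ETL_cleaning as etl

# These names appear in the __all__ of _basic_clean.py below; their
# re-export from the package root is an assumption based on the renames.
for name in ("basic_clean", "basic_clean_drop", "drop_macro_polars"):
    assert hasattr(etl, name), f"{name} not re-exported"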
ml_tools/ETL_cleaning/_basic_clean.py (new file, per entry 4 above)
@@ -0,0 +1,351 @@
+ import polars as pl
+ from pathlib import Path
+ from typing import Union
+
+ from ..data_exploration import show_null_columns
+ from ..utilities import save_dataframe_filename, load_dataframe
+
+ from ..path_manager import make_fullpath
+ from .._core import get_logger
+
+ from ._dragon_cleaner import DragonColumnCleaner, DragonDataFrameCleaner
+
+ _LOGGER = get_logger("ETL Basic Clean")
+
+
+ __all__ = [
+     "basic_clean",
+     "basic_clean_drop",
+     "drop_macro_polars",
+ ]
+
+
+ ########## Basic cleaners #############
+ def _cleaner_core(df_in: pl.DataFrame, all_lowercase: bool) -> pl.DataFrame:
+     # Cleaning rules
+     cleaning_rules = {
+         # 1. Comprehensive Punctuation & Symbol Normalization
+         # Remove invisible control characters
+         r'\p{C}+': '',
+
+         # Full-width to half-width
+         # Numbers
+         '0': '0', '1': '1', '2': '2', '3': '3', '4': '4',
+         '5': '5', '6': '6', '7': '7', '8': '8', '9': '9',
+         # Superscripts & Subscripts
+         '¹': '1', '²': '2', '³': '3', '⁴': '4', '⁵': '5',
+         '⁶': '6', '⁷': '7', '⁸': '8', '⁹': '9', '⁰': '0',
+         '₁': '1', '₂': '2', '₃': '3', '₄': '4', '₅': '5',
+         '₆': '6', '₇': '7', '₈': '8', '₉': '9', '₀': '0',
+         '⁺': '', '⁻': '', '₊': '', '₋': '',
+         # Uppercase Alphabet
+         'A': 'A', 'B': 'B', 'C': 'C', 'D': 'D', 'E': 'E', 'F': 'F',
+         'G': 'G', 'H': 'H', 'I': 'I', 'J': 'J', 'K': 'K', 'L': 'L',
+         'M': 'M', 'N': 'N', 'O': 'O', 'P': 'P', 'Q': 'Q', 'R': 'R',
+         'S': 'S', 'T': 'T', 'U': 'U', 'V': 'V', 'W': 'W', 'X': 'X',
+         'Y': 'Y', 'Z': 'Z',
+         # Lowercase Alphabet
+         'a': 'a', 'b': 'b', 'c': 'c', 'd': 'd', 'e': 'e', 'f': 'f',
+         'g': 'g', 'h': 'h', 'i': 'i', 'j': 'j', 'k': 'k', 'l': 'l',
+         'm': 'm', 'n': 'n', 'o': 'o', 'p': 'p', 'q': 'q', 'r': 'r',
+         's': 's', 't': 't', 'u': 'u', 'v': 'v', 'w': 'w', 'x': 'x',
+         'y': 'y', 'z': 'z',
+         # Punctuation
+         '》': '>', '《': '<', ':': ':', '。': '.', ';': ';', '【': '[', '】': ']', '∼': '~',
+         '(': '(', ')': ')', '?': '?', '!': '!', '~': '~', '@': '@', '#': '#', '+': '+', '-': '-',
+         '$': '$', '%': '%', '^': '^', '&': '&', '*': '*', '\': '-', '|': '|', '≈':'=', '·': '', '⋅': '',
+         '¯': '-', '_': '-',
+
+         # Commas (avoid commas in entries)
+         ',': ';',
+         ',': ';',
+         '、':';',
+
+         # Others
+         'σ': '',
+         '□': '',
+         '©': '',
+         '®': '',
+         '™': '',
+         r'[°˚]': '',
+
+         # Replace special characters in entries
+         r'\\': '_',
+
+         # Typographical standardization
+         # Unify various dashes and hyphens to a standard hyphen
+         r'[—–―]': '-',
+         r'−': '-',
+         # remove various quote types
+         r'[“”"]': '',
+         r"[‘’′']": '',
+
+         # Collapse repeating punctuation
+         r'\.{2,}': '.', # Replace two or more dots with a single dot
+         r'\?{2,}': '?', # Replace two or more question marks with a single question mark
+         r'!{2,}': '!', # Replace two or more exclamation marks with a single one
+         r';{2,}': ';',
+         r'-{2,}': '-',
+         r'/{2,}': '/',
+         r'%{2,}': '%',
+         r'&{2,}': '&',
+
+         # 2. Internal Whitespace Consolidation
+         # Collapse any sequence of whitespace chars (including non-breaking spaces) to a single space
+         r'\s+': ' ',
+
+         # 3. Leading/Trailing Whitespace Removal
+         # Strip any whitespace from the beginning or end of the string
+         r'^\s+|\s+$': '',
+
+         # 4. Textual Null Standardization (New Step)
+         # Convert common null-like text to actual nulls.
+         r'^(N/A|无|NA|NULL|NONE|NIL|-|\.|;|/|%|&)$': None,
+
+         # 5. Final Nullification of Empty Strings
+         # After all cleaning, if a string is now empty, convert it to a null
+         r'^\s*$': None,
+         r'^$': None,
+     }
+
+     # Clean data
+     try:
+         # Create a cleaner for every column in the dataframe
+         all_columns = df_in.columns
+         column_cleaners = [
+             DragonColumnCleaner(col, rules=cleaning_rules, case_insensitive=True) for col in all_columns
+         ]
+
+         # Instantiate and run the main dataframe cleaner
+         df_cleaner = DragonDataFrameCleaner(cleaners=column_cleaners)
+         df_cleaned = df_cleaner.clean(df_in)
+
+         # apply lowercase to all string columns
+         if all_lowercase:
+             df_final = df_cleaned.with_columns(
+                 pl.col(pl.String).str.to_lowercase()
+             )
+         else:
+             df_final = df_cleaned
+
+     except Exception as e:
+         _LOGGER.error(f"An error occurred during the cleaning process.")
+         raise e
+     else:
+         return df_final
+
+
+ def _local_path_manager(path_in: Union[str,Path], path_out: Union[str,Path]):
+     # Handle paths
+     input_path = make_fullpath(path_in, enforce="file")
+
+     parent_dir = make_fullpath(Path(path_out).parent, make=True, enforce="directory")
+     output_path = parent_dir / Path(path_out).name
+
+     return input_path, output_path
+
+
+ def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path], all_lowercase: bool=False):
+     """
+     Performs a comprehensive, standardized cleaning on all columns of a CSV file.
+
+     The cleaning process includes:
+     - Normalizing full-width and typographical punctuation to standard equivalents.
+     - Consolidating all internal whitespace (spaces, tabs, newlines) into a single space.
+     - Stripping any leading or trailing whitespace.
+     - Converting common textual representations of null (e.g., "N/A", "NULL") to true null values.
+     - Converting strings that become empty after cleaning into true null values.
+     - Normalizing all text to lowercase (Optional).
+
+     Args:
+         input_filepath (str | Path):
+             The path to the source CSV file to be cleaned.
+         output_filepath (str | Path):
+             The path to save the cleaned CSV file.
+         all_lowercase (bool):
+             Whether to normalize all text to lowercase.
+
+     """
+     # Handle paths
+     input_path, output_path = _local_path_manager(path_in=input_filepath, path_out=output_filepath)
+
+     # load polars df
+     df, _ = load_dataframe(df_path=input_path, kind="polars", all_strings=True)
+
+     # CLEAN
+     df_final = _cleaner_core(df_in=df, all_lowercase=all_lowercase)
+
+     # Save cleaned dataframe
+     save_dataframe_filename(df=df_final, save_dir=output_path.parent, filename=output_path.name)
+
+     _LOGGER.info(f"Data successfully cleaned.")
+
+
+ def basic_clean_drop(input_filepath: Union[str,Path],
+                      output_filepath: Union[str,Path],
+                      log_directory: Union[str,Path],
+                      targets: list[str],
+                      skip_targets: bool=False,
+                      threshold: float=0.8,
+                      all_lowercase: bool=False):
+     """
+     Performs standardized cleaning followed by iterative removal of rows and
+     columns with excessive missing data.
+
+     This function combines the functionality of `basic_clean` and `drop_macro_polars`. It first
+     applies a comprehensive normalization process to all columns in the input CSV file.
+     Then it applies iterative row and column dropping to remove redundant or incomplete data.
+
+     Args:
+         input_filepath (str | Path):
+             The path to the source CSV file to be cleaned.
+         output_filepath (str | Path):
+             The path to save the fully cleaned CSV file after cleaning
+             and missing-data-based pruning.
+         log_directory (str | Path):
+             Path to the directory where missing data reports will be stored.
+         targets (list[str]):
+             A list of column names to be treated as target variables.
+             This list guides the row-dropping logic.
+         skip_targets (bool):
+             If True, the columns listed in `targets` will be exempt from being dropped,
+             even if they exceed the missing data threshold.
+         threshold (float):
+             The proportion of missing data required to drop a row or column.
+             For example, 0.8 means a row/column will be dropped if 80% or more
+             of its data is missing.
+         all_lowercase (bool):
+             Whether to normalize all text to lowercase.
+     """
+     # handle log path
+     log_path = make_fullpath(log_directory, make=True, enforce="directory")
+
+     # Handle df paths
+     input_path, output_path = _local_path_manager(path_in=input_filepath, path_out=output_filepath)
+
+     # load polars df
+     df, _ = load_dataframe(df_path=input_path, kind="polars", all_strings=True)
+
+     # CLEAN
+     df_cleaned = _cleaner_core(df_in=df, all_lowercase=all_lowercase)
+
+     # Drop macro (Polars implementation)
+     df_final = drop_macro_polars(df=df_cleaned,
+                                  log_directory=log_path,
+                                  targets=targets,
+                                  skip_targets=skip_targets,
+                                  threshold=threshold)
+
+     # Save cleaned dataframe
+     save_dataframe_filename(df=df_final, save_dir=output_path.parent, filename=output_path.name)
+
+     _LOGGER.info(f"Data successfully cleaned.")
+
+
+ ########## EXTRACT and CLEAN ##########
+ def _generate_null_report(df: pl.DataFrame, save_dir: Path, filename: str):
+     """
+     Internal helper to generate and save a CSV report of missing data percentages using Polars.
+     """
+     total_rows = df.height
+     if total_rows == 0:
+         return
+
+     null_stats = df.null_count()
+
+     # Construct a report DataFrame
+     report = pl.DataFrame({
+         "column": df.columns,
+         "null_count": null_stats.transpose().to_series(),
+     }).with_columns(
+         (pl.col("null_count") / total_rows * 100).round(2).alias("missing_percent")
+     ).sort("missing_percent", descending=True)
+
+     save_dataframe_filename(df=report, save_dir=save_dir, filename=filename)
+
+
+ def drop_macro_polars(df: pl.DataFrame,
+                       log_directory: Path,
+                       targets: list[str],
+                       skip_targets: bool,
+                       threshold: float) -> pl.DataFrame:
+     """
+     High-performance implementation of iterative row/column pruning using Polars.
+     Includes temporary Pandas conversion for visualization.
+     """
+     df_clean = df.clone()
+
+     # --- Helper to generate plot safely ---
+     def _plot_safe(df_pl: pl.DataFrame, filename: str):
+         try:
+             # converting to pandas just for the plot
+             # use_pyarrow_extension_array=True is faster
+             df_pd = df_pl.to_pandas(use_pyarrow_extension_array=True)
+             show_null_columns(df_pd, plot_to_dir=log_directory, plot_filename=filename, use_all_columns=True)
+         except Exception as e:
+             _LOGGER.warning(f"Skipping plot generation due to error: {e}")
+
+     # 1. Log Initial State
+     _generate_null_report(df_clean, log_directory, "Missing_Data_Original")
+     _plot_safe(df_clean, "Original")
+
+     master = True
+     while master:
+         initial_rows, initial_cols = df_clean.shape
+
+         # --- A. Drop Constant Columns ---
+         # Keep columns where n_unique > 1.
+         # Note: n_unique in Polars ignores nulls by default (similar to pandas dropna=True).
+         # We assume if a column is all nulls, it should also be dropped (n_unique=0).
+         cols_to_keep = [
+             col for col in df_clean.columns
+             if df_clean[col].n_unique() > 1
+         ]
+         df_clean = df_clean.select(cols_to_keep)
+
+         # --- B. Drop Rows (Targets) ---
+         # Drop rows where ALL target columns are null
+         valid_targets = [t for t in targets if t in df_clean.columns]
+         if valid_targets:
+             df_clean = df_clean.filter(
+                 ~pl.all_horizontal(pl.col(valid_targets).is_null())
+             )
+
+         # --- C. Drop Rows (Features Threshold) ---
+         # Drop rows where missing data fraction in FEATURE columns > threshold
+         feature_cols = [c for c in df_clean.columns if c not in valid_targets]
+         if feature_cols:
+             # We want to KEEP rows where (null_count / total_features) <= threshold
+             df_clean = df_clean.filter(
+                 (pl.sum_horizontal(pl.col(feature_cols).is_null()) / len(feature_cols)) <= threshold
+             )
+
+         # --- D. Drop Columns (Threshold) ---
+         # Drop columns where missing data fraction > threshold
+         current_height = df_clean.height
+         if current_height > 0:
+             null_counts = df_clean.null_count().row(0) # tuple of counts
+             cols_to_drop = []
+
+             for col_idx, col_name in enumerate(df_clean.columns):
+                 # Check if we should skip this column (if it's a target and skip_targets=True)
+                 if skip_targets and col_name in valid_targets:
+                     continue
+
+                 missing_frac = null_counts[col_idx] / current_height
+                 if missing_frac > threshold:
+                     cols_to_drop.append(col_name)
+
+             if cols_to_drop:
+                 df_clean = df_clean.drop(cols_to_drop)
+
+         # --- E. Check Convergence ---
+         remaining_rows, remaining_cols = df_clean.shape
+         if remaining_rows >= initial_rows and remaining_cols >= initial_cols:
+             master = False
+
+     # 2. Log Final State
+     _generate_null_report(df_clean, log_directory, "Missing_Data_Processed")
+     _plot_safe(df_clean, "Processed")
+
+     return df_clean
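For orientation, a minimal usage sketch of the two public entry points added above. The file paths, column names, and data are hypothetical, and the import path assumes the ETL_cleaning package re-exports its submodules' __all__; basic_clean_drop wraps the whole pipeline, while drop_macro_polars can also be called directly on an in-memory Polars DataFrame.

import polars as pl
from pathlib import Path
from ml_tools.ETL_cleaning import basic_clean_drop, drop_macro_polars

# Hypothetical paths: normalize a raw CSV, then iteratively prune rows and
# columns whose missing-data fraction exceeds the threshold, writing null
# reports and plots into the log directory.
basic_clean_drop(
    input_filepath="data/raw.csv",
    output_filepath="data/clean.csv",
    log_directory="logs",
    targets=["target_a"],   # rows where ALL targets are null get dropped
    skip_targets=True,      # target columns are exempt from column dropping
    threshold=0.8,          # per the code above, drop when the fraction exceeds 0.8
    all_lowercase=True,
)

# The pruning stage alone, on an in-memory frame:
df = pl.DataFrame({
    "target_a": ["1.2", None, "3.4"],
    "feat_a": [None, None, None],   # all-null column: removed as constant
    "feat_b": ["x", "y", None],
})
pruned = drop_macro_polars(df=df, log_directory=Path("logs"),
                           targets=["target_a"], skip_targets=True, threshold=0.8)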
ml_tools/ETL_cleaning/_clean_tools.py (new file, per entry 5 above)
@@ -0,0 +1,128 @@
+ import polars as pl
+ from pathlib import Path
+ from typing import Union, Optional
+
+ from ..utilities import load_dataframe
+
+ from ..path_manager import sanitize_filename, make_fullpath
+ from .._core import get_logger
+
+
+ _LOGGER = get_logger("ETL Clean Tools")
+
+
+ __all__ = [
+     "save_unique_values",
+ ]
+
+
+ ################ Unique Values per column #################
+ def save_unique_values(csv_path_or_df: Union[str, Path, pl.DataFrame],
+                        output_dir: Union[str, Path],
+                        use_columns: Optional[list[str]] = None,
+                        verbose: bool=False,
+                        keep_column_order: bool = True,
+                        add_value_separator: bool = False) -> None:
+     """
+     Loads a CSV file or Polars DataFrame, then analyzes it and saves the unique non-null values
+     from each column into a separate text file exactly as they appear.
+
+     This is useful for understanding the raw categories or range of values
+     within a dataset before and after cleaning.
+
+     Args:
+         csv_path_or_df (str | Path | pl.DataFrame):
+             The file path to the input CSV file or a Polars DataFrame.
+         output_dir (str | Path):
+             The path to the directory where the .txt files will be saved.
+             The directory will be created if it does not exist.
+         keep_column_order (bool):
+             If True, prepends a numeric prefix to each
+             output filename to maintain the original column order.
+         add_value_separator (bool):
+             If True, adds a separator line between each unique value.
+         use_columns (List[str] | None):
+             If provided, only these columns will be processed. If None, all columns will be processed.
+         verbose (bool):
+             If True, prints the number of unique values saved for each column.
+     """
+     # 1 Handle input DataFrame or path
+     if isinstance(csv_path_or_df, pl.DataFrame):
+         df = csv_path_or_df
+         if use_columns is not None:
+             # Validate columns exist
+             valid_cols = [c for c in use_columns if c in df.columns]
+             if not valid_cols:
+                 _LOGGER.error("None of the specified columns in 'use_columns' exist in the provided DataFrame.")
+                 raise ValueError()
+             df = df.select(valid_cols)
+     else:
+         csv_path = make_fullpath(input_path=csv_path_or_df, enforce="file")
+         df = load_dataframe(df_path=csv_path, use_columns=use_columns, kind="polars", all_strings=True)[0]
+
+     output_dir = make_fullpath(input_path=output_dir, make=True, enforce='directory')
+
+     if df.height == 0:
+         _LOGGER.warning("The input DataFrame is empty. No unique values to save.")
+         return
+
+     # --- 2. Process Each Column ---
+     counter = 0
+
+     # Iterate over columns using Polars methods
+     for i, column_name in enumerate(df.columns):
+         try:
+             col_expr = pl.col(column_name)
+
+             # Check if the column is string-based (String or Utf8)
+             dtype = df.schema[column_name]
+             if dtype in (pl.String, pl.Utf8):
+                 # Filter out actual empty strings AND whitespace-only strings
+                 dataset = df.select(col_expr).filter(
+                     col_expr.str.strip_chars().str.len_chars() > 0
+                 )
+             else:
+                 dataset = df.select(col_expr)
+
+             # Efficiently get unique non-null values and sort them
+             unique_series = dataset.drop_nulls().unique().sort(column_name)
+
+             # Convert to a python list for writing
+             sorted_uniques = unique_series.to_series().to_list()
+
+         except Exception:
+             _LOGGER.error(f"Could not process column '{column_name}'.")
+             continue
+
+         if not sorted_uniques:
+             _LOGGER.warning(f"Column '{column_name}' has no unique non-null values. Skipping.")
+             continue
+
+         # --- 3. Filename Generation ---
+         sanitized_name = sanitize_filename(column_name)
+         if not sanitized_name.strip('_'):
+             sanitized_name = f'column_{i}'
+
+         prefix = f"{i + 1}_" if keep_column_order else ''
+         file_path = output_dir / f"{prefix}{sanitized_name}_unique_values.txt"
+
+         # --- 4. Write to File ---
+         try:
+             with open(file_path, 'w', encoding='utf-8') as f:
+                 f.write(f"# Unique values for column: '{column_name}'\n")
+                 f.write(f"# Total unique non-null values: {len(sorted_uniques)}\n")
+                 f.write("-" * 30 + "\n")
+
+                 for value in sorted_uniques:
+                     f.write(f"{value}\n")
+                     if add_value_separator:
+                         f.write("-" * 30 + "\n")
+
+         except IOError:
+             _LOGGER.exception(f"Error writing to file {file_path}.")
+         else:
+             if verbose:
+                 print(f" Successfully saved {len(sorted_uniques)} unique values from '{column_name}'.")
+             counter += 1
+
+     _LOGGER.info(f"{counter} files of unique values created.")
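A short usage sketch for save_unique_values; the data and output directory are hypothetical, and the import path again assumes the function is re-exported by the ETL_cleaning package, as its __all__ entry suggests. It writes one text file of sorted unique values per column, prefixed 1_, 2_, ... when keep_column_order=True.

import polars as pl
from ml_tools.ETL_cleaning import save_unique_values

# Hypothetical audit data; whitespace-only entries are filtered out
# before unique values are collected, but nulls never reach the files.
df = pl.DataFrame({
    "city": ["tokyo", "tokyo ", "  ", None, "osaka"],
    "grade": ["a", "b", "b", None, "a"],
})

save_unique_values(
    csv_path_or_df=df,              # a CSV path works here too
    output_dir="reports/uniques",   # created if it does not exist
    use_columns=None,               # process every column
    verbose=True,
    keep_column_order=True,
    add_value_separator=False,
)
# -> reports/uniques/1_city_unique_values.txt, reports/uniques/2_grade_unique_values.txt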