dragon-ml-toolbox 19.14.0__py3-none-any.whl → 20.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (219)
  1. {dragon_ml_toolbox-19.14.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/METADATA +29 -46
  2. dragon_ml_toolbox-20.0.0.dist-info/RECORD +178 -0
  3. ml_tools/{ETL_cleaning.py → ETL_cleaning/__init__.py} +13 -5
  4. ml_tools/ETL_cleaning/_basic_clean.py +351 -0
  5. ml_tools/ETL_cleaning/_clean_tools.py +128 -0
  6. ml_tools/ETL_cleaning/_dragon_cleaner.py +245 -0
  7. ml_tools/ETL_cleaning/_imprimir.py +13 -0
  8. ml_tools/{ETL_engineering.py → ETL_engineering/__init__.py} +8 -4
  9. ml_tools/ETL_engineering/_dragon_engineering.py +261 -0
  10. ml_tools/ETL_engineering/_imprimir.py +24 -0
  11. ml_tools/{_core/_ETL_engineering.py → ETL_engineering/_transforms.py} +14 -267
  12. ml_tools/{_core → GUI_tools}/_GUI_tools.py +37 -40
  13. ml_tools/{GUI_tools.py → GUI_tools/__init__.py} +7 -5
  14. ml_tools/GUI_tools/_imprimir.py +12 -0
  15. ml_tools/IO_tools/_IO_loggers.py +235 -0
  16. ml_tools/IO_tools/_IO_save_load.py +151 -0
  17. ml_tools/IO_tools/_IO_utils.py +140 -0
  18. ml_tools/{IO_tools.py → IO_tools/__init__.py} +13 -5
  19. ml_tools/IO_tools/_imprimir.py +14 -0
  20. ml_tools/MICE/_MICE_imputation.py +132 -0
  21. ml_tools/{MICE_imputation.py → MICE/__init__.py} +6 -7
  22. ml_tools/{_core/_MICE_imputation.py → MICE/_dragon_mice.py} +243 -322
  23. ml_tools/MICE/_imprimir.py +11 -0
  24. ml_tools/{ML_callbacks.py → ML_callbacks/__init__.py} +12 -4
  25. ml_tools/ML_callbacks/_base.py +101 -0
  26. ml_tools/ML_callbacks/_checkpoint.py +232 -0
  27. ml_tools/ML_callbacks/_early_stop.py +208 -0
  28. ml_tools/ML_callbacks/_imprimir.py +12 -0
  29. ml_tools/ML_callbacks/_scheduler.py +197 -0
  30. ml_tools/{ML_chaining_utilities.py → ML_chain/__init__.py} +8 -3
  31. ml_tools/{_core/_ML_chaining_utilities.py → ML_chain/_chaining_tools.py} +5 -129
  32. ml_tools/ML_chain/_dragon_chain.py +140 -0
  33. ml_tools/ML_chain/_imprimir.py +11 -0
  34. ml_tools/ML_configuration/__init__.py +90 -0
  35. ml_tools/ML_configuration/_base_model_config.py +69 -0
  36. ml_tools/ML_configuration/_finalize.py +366 -0
  37. ml_tools/ML_configuration/_imprimir.py +47 -0
  38. ml_tools/ML_configuration/_metrics.py +593 -0
  39. ml_tools/ML_configuration/_models.py +206 -0
  40. ml_tools/ML_configuration/_training.py +124 -0
  41. ml_tools/ML_datasetmaster/__init__.py +28 -0
  42. ml_tools/ML_datasetmaster/_base_datasetmaster.py +337 -0
  43. ml_tools/{_core/_ML_datasetmaster.py → ML_datasetmaster/_datasetmaster.py} +9 -329
  44. ml_tools/ML_datasetmaster/_imprimir.py +15 -0
  45. ml_tools/{_core/_ML_sequence_datasetmaster.py → ML_datasetmaster/_sequence_datasetmaster.py} +13 -15
  46. ml_tools/{_core/_ML_vision_datasetmaster.py → ML_datasetmaster/_vision_datasetmaster.py} +63 -65
  47. ml_tools/ML_evaluation/__init__.py +53 -0
  48. ml_tools/ML_evaluation/_classification.py +629 -0
  49. ml_tools/ML_evaluation/_feature_importance.py +409 -0
  50. ml_tools/ML_evaluation/_imprimir.py +25 -0
  51. ml_tools/ML_evaluation/_loss.py +92 -0
  52. ml_tools/ML_evaluation/_regression.py +273 -0
  53. ml_tools/{_core/_ML_sequence_evaluation.py → ML_evaluation/_sequence.py} +8 -11
  54. ml_tools/{_core/_ML_vision_evaluation.py → ML_evaluation/_vision.py} +12 -17
  55. ml_tools/{_core → ML_evaluation_captum}/_ML_evaluation_captum.py +11 -38
  56. ml_tools/{ML_evaluation_captum.py → ML_evaluation_captum/__init__.py} +6 -4
  57. ml_tools/ML_evaluation_captum/_imprimir.py +10 -0
  58. ml_tools/{_core → ML_finalize_handler}/_ML_finalize_handler.py +3 -7
  59. ml_tools/ML_finalize_handler/__init__.py +10 -0
  60. ml_tools/ML_finalize_handler/_imprimir.py +8 -0
  61. ml_tools/ML_inference/__init__.py +22 -0
  62. ml_tools/ML_inference/_base_inference.py +166 -0
  63. ml_tools/{_core/_ML_chaining_inference.py → ML_inference/_chain_inference.py} +14 -17
  64. ml_tools/ML_inference/_dragon_inference.py +332 -0
  65. ml_tools/ML_inference/_imprimir.py +11 -0
  66. ml_tools/ML_inference/_multi_inference.py +180 -0
  67. ml_tools/ML_inference_sequence/__init__.py +10 -0
  68. ml_tools/ML_inference_sequence/_imprimir.py +8 -0
  69. ml_tools/{_core/_ML_sequence_inference.py → ML_inference_sequence/_sequence_inference.py} +11 -15
  70. ml_tools/ML_inference_vision/__init__.py +10 -0
  71. ml_tools/ML_inference_vision/_imprimir.py +8 -0
  72. ml_tools/{_core/_ML_vision_inference.py → ML_inference_vision/_vision_inference.py} +15 -19
  73. ml_tools/ML_models/__init__.py +32 -0
  74. ml_tools/{_core/_ML_models_advanced.py → ML_models/_advanced_models.py} +22 -18
  75. ml_tools/ML_models/_base_mlp_attention.py +198 -0
  76. ml_tools/{_core/_models_advanced_base.py → ML_models/_base_save_load.py} +73 -49
  77. ml_tools/ML_models/_dragon_tabular.py +248 -0
  78. ml_tools/ML_models/_imprimir.py +18 -0
  79. ml_tools/ML_models/_mlp_attention.py +134 -0
  80. ml_tools/{_core → ML_models}/_models_advanced_helpers.py +13 -13
  81. ml_tools/ML_models_sequence/__init__.py +10 -0
  82. ml_tools/ML_models_sequence/_imprimir.py +8 -0
  83. ml_tools/{_core/_ML_sequence_models.py → ML_models_sequence/_sequence_models.py} +5 -8
  84. ml_tools/ML_models_vision/__init__.py +29 -0
  85. ml_tools/ML_models_vision/_base_wrapper.py +254 -0
  86. ml_tools/ML_models_vision/_image_classification.py +182 -0
  87. ml_tools/ML_models_vision/_image_segmentation.py +108 -0
  88. ml_tools/ML_models_vision/_imprimir.py +16 -0
  89. ml_tools/ML_models_vision/_object_detection.py +135 -0
  90. ml_tools/ML_optimization/__init__.py +21 -0
  91. ml_tools/ML_optimization/_imprimir.py +13 -0
  92. ml_tools/{_core/_ML_optimization_pareto.py → ML_optimization/_multi_dragon.py} +18 -24
  93. ml_tools/ML_optimization/_single_dragon.py +203 -0
  94. ml_tools/{_core/_ML_optimization.py → ML_optimization/_single_manual.py} +75 -213
  95. ml_tools/{_core → ML_scaler}/_ML_scaler.py +8 -11
  96. ml_tools/ML_scaler/__init__.py +10 -0
  97. ml_tools/ML_scaler/_imprimir.py +8 -0
  98. ml_tools/ML_trainer/__init__.py +20 -0
  99. ml_tools/ML_trainer/_base_trainer.py +297 -0
  100. ml_tools/ML_trainer/_dragon_detection_trainer.py +402 -0
  101. ml_tools/ML_trainer/_dragon_sequence_trainer.py +540 -0
  102. ml_tools/ML_trainer/_dragon_trainer.py +1160 -0
  103. ml_tools/ML_trainer/_imprimir.py +10 -0
  104. ml_tools/{ML_utilities.py → ML_utilities/__init__.py} +14 -6
  105. ml_tools/ML_utilities/_artifact_finder.py +382 -0
  106. ml_tools/ML_utilities/_imprimir.py +16 -0
  107. ml_tools/ML_utilities/_inspection.py +325 -0
  108. ml_tools/ML_utilities/_train_tools.py +205 -0
  109. ml_tools/{ML_vision_transformers.py → ML_vision_transformers/__init__.py} +9 -6
  110. ml_tools/{_core/_ML_vision_transformers.py → ML_vision_transformers/_core_transforms.py} +11 -155
  111. ml_tools/ML_vision_transformers/_imprimir.py +14 -0
  112. ml_tools/ML_vision_transformers/_offline_augmentation.py +159 -0
  113. ml_tools/{_core/_PSO_optimization.py → PSO_optimization/_PSO.py} +58 -15
  114. ml_tools/{PSO_optimization.py → PSO_optimization/__init__.py} +5 -3
  115. ml_tools/PSO_optimization/_imprimir.py +10 -0
  116. ml_tools/SQL/__init__.py +7 -0
  117. ml_tools/{_core/_SQL.py → SQL/_dragon_SQL.py} +7 -11
  118. ml_tools/SQL/_imprimir.py +8 -0
  119. ml_tools/{_core → VIF}/_VIF_factor.py +5 -8
  120. ml_tools/{VIF_factor.py → VIF/__init__.py} +4 -2
  121. ml_tools/VIF/_imprimir.py +10 -0
  122. ml_tools/_core/__init__.py +7 -1
  123. ml_tools/_core/_logger.py +8 -18
  124. ml_tools/_core/_schema_load_ops.py +43 -0
  125. ml_tools/_core/_script_info.py +2 -2
  126. ml_tools/{data_exploration.py → data_exploration/__init__.py} +32 -16
  127. ml_tools/data_exploration/_analysis.py +214 -0
  128. ml_tools/data_exploration/_cleaning.py +566 -0
  129. ml_tools/data_exploration/_features.py +583 -0
  130. ml_tools/data_exploration/_imprimir.py +32 -0
  131. ml_tools/data_exploration/_plotting.py +487 -0
  132. ml_tools/data_exploration/_schema_ops.py +176 -0
  133. ml_tools/{ensemble_evaluation.py → ensemble_evaluation/__init__.py} +6 -4
  134. ml_tools/{_core → ensemble_evaluation}/_ensemble_evaluation.py +3 -7
  135. ml_tools/ensemble_evaluation/_imprimir.py +14 -0
  136. ml_tools/{ensemble_inference.py → ensemble_inference/__init__.py} +5 -3
  137. ml_tools/{_core → ensemble_inference}/_ensemble_inference.py +15 -18
  138. ml_tools/ensemble_inference/_imprimir.py +9 -0
  139. ml_tools/{ensemble_learning.py → ensemble_learning/__init__.py} +4 -6
  140. ml_tools/{_core → ensemble_learning}/_ensemble_learning.py +7 -10
  141. ml_tools/ensemble_learning/_imprimir.py +10 -0
  142. ml_tools/{excel_handler.py → excel_handler/__init__.py} +5 -3
  143. ml_tools/{_core → excel_handler}/_excel_handler.py +6 -10
  144. ml_tools/excel_handler/_imprimir.py +13 -0
  145. ml_tools/{keys.py → keys/__init__.py} +4 -1
  146. ml_tools/keys/_imprimir.py +11 -0
  147. ml_tools/{_core → keys}/_keys.py +2 -0
  148. ml_tools/{math_utilities.py → math_utilities/__init__.py} +5 -2
  149. ml_tools/math_utilities/_imprimir.py +11 -0
  150. ml_tools/{_core → math_utilities}/_math_utilities.py +1 -5
  151. ml_tools/{optimization_tools.py → optimization_tools/__init__.py} +9 -4
  152. ml_tools/optimization_tools/_imprimir.py +13 -0
  153. ml_tools/optimization_tools/_optimization_bounds.py +236 -0
  154. ml_tools/optimization_tools/_optimization_plots.py +218 -0
  155. ml_tools/{path_manager.py → path_manager/__init__.py} +6 -3
  156. ml_tools/{_core/_path_manager.py → path_manager/_dragonmanager.py} +11 -347
  157. ml_tools/path_manager/_imprimir.py +15 -0
  158. ml_tools/path_manager/_path_tools.py +346 -0
  159. ml_tools/plot_fonts/__init__.py +8 -0
  160. ml_tools/plot_fonts/_imprimir.py +8 -0
  161. ml_tools/{_core → plot_fonts}/_plot_fonts.py +2 -5
  162. ml_tools/schema/__init__.py +15 -0
  163. ml_tools/schema/_feature_schema.py +223 -0
  164. ml_tools/schema/_gui_schema.py +191 -0
  165. ml_tools/schema/_imprimir.py +10 -0
  166. ml_tools/{serde.py → serde/__init__.py} +4 -2
  167. ml_tools/serde/_imprimir.py +10 -0
  168. ml_tools/{_core → serde}/_serde.py +3 -8
  169. ml_tools/{utilities.py → utilities/__init__.py} +11 -6
  170. ml_tools/utilities/_imprimir.py +18 -0
  171. ml_tools/{_core/_utilities.py → utilities/_utility_save_load.py} +13 -190
  172. ml_tools/utilities/_utility_tools.py +192 -0
  173. dragon_ml_toolbox-19.14.0.dist-info/RECORD +0 -111
  174. ml_tools/ML_chaining_inference.py +0 -8
  175. ml_tools/ML_configuration.py +0 -86
  176. ml_tools/ML_configuration_pytab.py +0 -14
  177. ml_tools/ML_datasetmaster.py +0 -10
  178. ml_tools/ML_evaluation.py +0 -16
  179. ml_tools/ML_evaluation_multi.py +0 -12
  180. ml_tools/ML_finalize_handler.py +0 -8
  181. ml_tools/ML_inference.py +0 -12
  182. ml_tools/ML_models.py +0 -14
  183. ml_tools/ML_models_advanced.py +0 -14
  184. ml_tools/ML_models_pytab.py +0 -14
  185. ml_tools/ML_optimization.py +0 -14
  186. ml_tools/ML_optimization_pareto.py +0 -8
  187. ml_tools/ML_scaler.py +0 -8
  188. ml_tools/ML_sequence_datasetmaster.py +0 -8
  189. ml_tools/ML_sequence_evaluation.py +0 -10
  190. ml_tools/ML_sequence_inference.py +0 -8
  191. ml_tools/ML_sequence_models.py +0 -8
  192. ml_tools/ML_trainer.py +0 -12
  193. ml_tools/ML_vision_datasetmaster.py +0 -12
  194. ml_tools/ML_vision_evaluation.py +0 -10
  195. ml_tools/ML_vision_inference.py +0 -8
  196. ml_tools/ML_vision_models.py +0 -18
  197. ml_tools/SQL.py +0 -8
  198. ml_tools/_core/_ETL_cleaning.py +0 -694
  199. ml_tools/_core/_IO_tools.py +0 -498
  200. ml_tools/_core/_ML_callbacks.py +0 -702
  201. ml_tools/_core/_ML_configuration.py +0 -1332
  202. ml_tools/_core/_ML_configuration_pytab.py +0 -102
  203. ml_tools/_core/_ML_evaluation.py +0 -867
  204. ml_tools/_core/_ML_evaluation_multi.py +0 -544
  205. ml_tools/_core/_ML_inference.py +0 -646
  206. ml_tools/_core/_ML_models.py +0 -668
  207. ml_tools/_core/_ML_models_pytab.py +0 -693
  208. ml_tools/_core/_ML_trainer.py +0 -2323
  209. ml_tools/_core/_ML_utilities.py +0 -886
  210. ml_tools/_core/_ML_vision_models.py +0 -644
  211. ml_tools/_core/_data_exploration.py +0 -1909
  212. ml_tools/_core/_optimization_tools.py +0 -493
  213. ml_tools/_core/_schema.py +0 -359
  214. ml_tools/plot_fonts.py +0 -8
  215. ml_tools/schema.py +0 -12
  216. {dragon_ml_toolbox-19.14.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/WHEEL +0 -0
  217. {dragon_ml_toolbox-19.14.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/licenses/LICENSE +0 -0
  218. {dragon_ml_toolbox-19.14.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
  219. {dragon_ml_toolbox-19.14.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/top_level.txt +0 -0
@@ -1,694 +0,0 @@
- import polars as pl
- from pathlib import Path
- from typing import Union, List, Dict, Optional
-
- from ._path_manager import sanitize_filename, make_fullpath
- from ._data_exploration import show_null_columns
- from ._utilities import save_dataframe_filename, load_dataframe
- from ._script_info import _script_info
- from ._logger import get_logger
-
-
- _LOGGER = get_logger("ETL Cleaning")
-
-
- __all__ = [
-     "DragonColumnCleaner",
-     "DragonDataFrameCleaner",
-     "save_unique_values",
-     "basic_clean",
-     "basic_clean_drop",
-     "drop_macro_polars",
- ]
-
-
- ################ Unique Values per column #################
- def save_unique_values(csv_path_or_df: Union[str, Path, pl.DataFrame],
-                        output_dir: Union[str, Path],
-                        use_columns: Optional[List[str]] = None,
-                        verbose: bool=False,
-                        keep_column_order: bool = True,
-                        add_value_separator: bool = False) -> None:
-     """
-     Loads a CSV file or Polars DataFrame, then analyzes it and saves the unique non-null values
-     from each column into a separate text file exactly as they appear.
-
-     This is useful for understanding the raw categories or range of values
-     within a dataset before and after cleaning.
-
-     Args:
-         csv_path_or_df (str | Path | pl.DataFrame):
-             The file path to the input CSV file or a Polars DataFrame.
-         output_dir (str | Path):
-             The path to the directory where the .txt files will be saved.
-             The directory will be created if it does not exist.
-         keep_column_order (bool):
-             If True, prepends a numeric prefix to each
-             output filename to maintain the original column order.
-         add_value_separator (bool):
-             If True, adds a separator line between each unique value.
-         use_columns (List[str] | None):
-             If provided, only these columns will be processed. If None, all columns will be processed.
-         verbose (bool):
-             If True, prints the number of unique values saved for each column.
-     """
-     # --- 1. Handle input DataFrame or path ---
-     if isinstance(csv_path_or_df, pl.DataFrame):
-         df = csv_path_or_df
-         if use_columns is not None:
-             # Validate columns exist
-             valid_cols = [c for c in use_columns if c in df.columns]
-             if not valid_cols:
-                 _LOGGER.error("None of the specified columns in 'use_columns' exist in the provided DataFrame.")
-                 raise ValueError()
-             df = df.select(valid_cols)
-     else:
-         csv_path = make_fullpath(input_path=csv_path_or_df, enforce="file")
-         df = load_dataframe(df_path=csv_path, use_columns=use_columns, kind="polars", all_strings=True)[0]
-
-     output_dir = make_fullpath(input_path=output_dir, make=True, enforce='directory')
-
-     if df.height == 0:
-         _LOGGER.warning("The input DataFrame is empty. No unique values to save.")
-         return
-
-     # --- 2. Process Each Column ---
-     counter = 0
-
-     # Iterate over columns using Polars methods
-     for i, column_name in enumerate(df.columns):
-         try:
-             col_expr = pl.col(column_name)
-
-             # Check if the column is string-based (String or Utf8)
-             dtype = df.schema[column_name]
-             if dtype in (pl.String, pl.Utf8):
-                 # Filter out actual empty strings AND whitespace-only strings
-                 dataset = df.select(col_expr).filter(
-                     col_expr.str.strip_chars().str.len_chars() > 0
-                 )
-             else:
-                 dataset = df.select(col_expr)
-
-             # Efficiently get unique non-null values and sort them
-             unique_series = dataset.drop_nulls().unique().sort(column_name)
-
-             # Convert to a python list for writing
-             sorted_uniques = unique_series.to_series().to_list()
-
-         except Exception:
-             _LOGGER.error(f"Could not process column '{column_name}'.")
-             continue
-
-         if not sorted_uniques:
-             _LOGGER.warning(f"Column '{column_name}' has no unique non-null values. Skipping.")
-             continue
-
-         # --- 3. Filename Generation ---
-         sanitized_name = sanitize_filename(column_name)
-         if not sanitized_name.strip('_'):
-             sanitized_name = f'column_{i}'
-
-         prefix = f"{i + 1}_" if keep_column_order else ''
-         file_path = output_dir / f"{prefix}{sanitized_name}_unique_values.txt"
-
-         # --- 4. Write to File ---
-         try:
-             with open(file_path, 'w', encoding='utf-8') as f:
-                 f.write(f"# Unique values for column: '{column_name}'\n")
-                 f.write(f"# Total unique non-null values: {len(sorted_uniques)}\n")
-                 f.write("-" * 30 + "\n")
-
-                 for value in sorted_uniques:
-                     f.write(f"{value}\n")
-                     if add_value_separator:
-                         f.write("-" * 30 + "\n")
-
-         except IOError:
-             _LOGGER.exception(f"Error writing to file {file_path}.")
-         else:
-             if verbose:
-                 print(f" Successfully saved {len(sorted_uniques)} unique values from '{column_name}'.")
-             counter += 1
-
-     _LOGGER.info(f"{counter} files of unique values created.")
-
-
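A minimal usage sketch for `save_unique_values` (hypothetical paths; the import assumes the names listed in `__all__` are reachable via the flat `ml_tools.ETL_cleaning` module of this release):

```python
# Writes one .txt file of unique values per column, e.g. "1_age_unique_values.txt".
from pathlib import Path
from ml_tools.ETL_cleaning import save_unique_values

save_unique_values(
    csv_path_or_df=Path("data/raw.csv"),       # or pass a pl.DataFrame directly
    output_dir=Path("reports/unique_values"),  # created if it does not exist
    use_columns=None,                          # None processes every column
    verbose=True,
)
```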
- ########## Basic df cleaners #############
- def _cleaner_core(df_in: pl.DataFrame, all_lowercase: bool) -> pl.DataFrame:
-     # Cleaning rules
-     cleaning_rules = {
-         # 1. Comprehensive Punctuation & Symbol Normalization
-         # Remove invisible control characters
-         r'\p{C}+': '',
-
-         # Full-width to half-width
-         # Numbers
-         '０': '0', '１': '1', '２': '2', '３': '3', '４': '4',
-         '５': '5', '６': '6', '７': '7', '８': '8', '９': '9',
-         # Superscripts & Subscripts
-         '¹': '1', '²': '2', '³': '3', '⁴': '4', '⁵': '5',
-         '⁶': '6', '⁷': '7', '⁸': '8', '⁹': '9', '⁰': '0',
-         '₁': '1', '₂': '2', '₃': '3', '₄': '4', '₅': '5',
-         '₆': '6', '₇': '7', '₈': '8', '₉': '9', '₀': '0',
-         '⁺': '', '⁻': '', '₊': '', '₋': '',
-         # Uppercase Alphabet
-         'Ａ': 'A', 'Ｂ': 'B', 'Ｃ': 'C', 'Ｄ': 'D', 'Ｅ': 'E', 'Ｆ': 'F',
-         'Ｇ': 'G', 'Ｈ': 'H', 'Ｉ': 'I', 'Ｊ': 'J', 'Ｋ': 'K', 'Ｌ': 'L',
-         'Ｍ': 'M', 'Ｎ': 'N', 'Ｏ': 'O', 'Ｐ': 'P', 'Ｑ': 'Q', 'Ｒ': 'R',
-         'Ｓ': 'S', 'Ｔ': 'T', 'Ｕ': 'U', 'Ｖ': 'V', 'Ｗ': 'W', 'Ｘ': 'X',
-         'Ｙ': 'Y', 'Ｚ': 'Z',
-         # Lowercase Alphabet
-         'ａ': 'a', 'ｂ': 'b', 'ｃ': 'c', 'ｄ': 'd', 'ｅ': 'e', 'ｆ': 'f',
-         'ｇ': 'g', 'ｈ': 'h', 'ｉ': 'i', 'ｊ': 'j', 'ｋ': 'k', 'ｌ': 'l',
-         'ｍ': 'm', 'ｎ': 'n', 'ｏ': 'o', 'ｐ': 'p', 'ｑ': 'q', 'ｒ': 'r',
-         'ｓ': 's', 'ｔ': 't', 'ｕ': 'u', 'ｖ': 'v', 'ｗ': 'w', 'ｘ': 'x',
-         'ｙ': 'y', 'ｚ': 'z',
-         # Punctuation
-         '》': '>', '《': '<', '：': ':', '。': '.', '；': ';', '【': '[', '】': ']', '∼': '~',
-         '（': '(', '）': ')', '？': '?', '！': '!', '～': '~', '＠': '@', '＃': '#', '＋': '+', '－': '-',
-         '＄': '$', '％': '%', '＾': '^', '＆': '&', '＊': '*', '＼': '-', '｜': '|', '≈': '=', '·': '', '⋅': '',
-         '¯': '-', '＿': '-',
-
-         # Commas (avoid commas in entries)
-         '，': ';',
-         ',': ';',
-         '、': ';',
-
-         # Others
-         'σ': '',
-         '□': '',
-         '©': '',
-         '®': '',
-         '™': '',
-         r'[°˚]': '',
-
-         # Replace special characters in entries
-         r'\\': '_',
-
-         # Typographical standardization
-         # Unify various dashes and hyphens to a standard hyphen
-         r'[—–―]': '-',
-         r'−': '-',
-         # remove various quote types
-         r'[“”"]': '',
-         r"[‘’′']": '',
-
-         # Collapse repeating punctuation
-         r'\.{2,}': '.', # Replace two or more dots with a single dot
-         r'\?{2,}': '?', # Replace two or more question marks with a single question mark
-         r'!{2,}': '!', # Replace two or more exclamation marks with a single one
-         r';{2,}': ';',
-         r'-{2,}': '-',
-         r'/{2,}': '/',
-         r'%{2,}': '%',
-         r'&{2,}': '&',
-
-         # 2. Internal Whitespace Consolidation
-         # Collapse any sequence of whitespace chars (including non-breaking spaces) to a single space
-         r'\s+': ' ',
-
-         # 3. Leading/Trailing Whitespace Removal
-         # Strip any whitespace from the beginning or end of the string
-         r'^\s+|\s+$': '',
-
-         # 4. Textual Null Standardization (New Step)
-         # Convert common null-like text to actual nulls.
-         r'^(N/A|无|NA|NULL|NONE|NIL|-|\.|;|/|%|&)$': None,
-
-         # 5. Final Nullification of Empty Strings
-         # After all cleaning, if a string is now empty, convert it to a null
-         r'^\s*$': None,
-         r'^$': None,
-     }
-
-     # Clean data
-     try:
-         # Create a cleaner for every column in the dataframe
-         all_columns = df_in.columns
-         column_cleaners = [
-             DragonColumnCleaner(col, rules=cleaning_rules, case_insensitive=True) for col in all_columns
-         ]
-
-         # Instantiate and run the main dataframe cleaner
-         df_cleaner = DragonDataFrameCleaner(cleaners=column_cleaners)
-         df_cleaned = df_cleaner.clean(df_in)
-
-         # apply lowercase to all string columns
-         if all_lowercase:
-             df_final = df_cleaned.with_columns(
-                 pl.col(pl.String).str.to_lowercase()
-             )
-         else:
-             df_final = df_cleaned
-
-     except Exception as e:
-         _LOGGER.error("An error occurred during the cleaning process.")
-         raise e
-     else:
-         return df_final
-
-
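The rules above are consumed by the `DragonColumnCleaner`/`DragonDataFrameCleaner` pair defined later in this file: string replacements apply in insertion order, while a `None` replacement nullifies matching values. A small sketch of those semantics, using a reduced, hypothetical rule set:

```python
import polars as pl
from ml_tools.ETL_cleaning import DragonColumnCleaner, DragonDataFrameCleaner

df = pl.DataFrame({"note": ["  hello   world ", "N/A", "ok!!", ""]})
cleaner = DragonColumnCleaner(
    column_name="note",
    rules={
        r'!{2,}': '!',     # collapse repeated exclamation marks
        r'\s+': ' ',       # consolidate internal whitespace
        r'^\s+|\s+$': '',  # strip leading/trailing whitespace
        r'^(N/A)$': None,  # textual null -> true null
        r'^$': None,       # empty string -> true null
    },
    case_insensitive=True,
)
result = DragonDataFrameCleaner(cleaners=[cleaner]).clean(df)
# "note" becomes: ["hello world", None, "ok!", None]
```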
- def _local_path_manager(path_in: Union[str,Path], path_out: Union[str,Path]):
-     # Handle paths
-     input_path = make_fullpath(path_in, enforce="file")
-
-     parent_dir = make_fullpath(Path(path_out).parent, make=True, enforce="directory")
-     output_path = parent_dir / Path(path_out).name
-
-     return input_path, output_path
-
-
- def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path], all_lowercase: bool=False):
-     """
-     Performs a comprehensive, standardized cleaning on all columns of a CSV file.
-
-     The cleaning process includes:
-     - Normalizing full-width and typographical punctuation to standard equivalents.
-     - Consolidating all internal whitespace (spaces, tabs, newlines) into a single space.
-     - Stripping any leading or trailing whitespace.
-     - Converting common textual representations of null (e.g., "N/A", "NULL") to true null values.
-     - Converting strings that become empty after cleaning into true null values.
-     - Normalizing all text to lowercase (Optional).
-
-     Args:
-         input_filepath (str | Path):
-             The path to the source CSV file to be cleaned.
-         output_filepath (str | Path):
-             The path to save the cleaned CSV file.
-         all_lowercase (bool):
-             Whether to normalize all text to lowercase.
-
-     """
-     # Handle paths
-     input_path, output_path = _local_path_manager(path_in=input_filepath, path_out=output_filepath)
-
-     # load polars df
-     df, _ = load_dataframe(df_path=input_path, kind="polars", all_strings=True)
-
-     # CLEAN
-     df_final = _cleaner_core(df_in=df, all_lowercase=all_lowercase)
-
-     # Save cleaned dataframe
-     save_dataframe_filename(df=df_final, save_dir=output_path.parent, filename=output_path.name)
-
-     _LOGGER.info("Data successfully cleaned.")
-
-
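A hypothetical `basic_clean` call with placeholder paths:

```python
from ml_tools.ETL_cleaning import basic_clean

basic_clean(
    input_filepath="data/raw.csv",
    output_filepath="data/clean.csv",
    all_lowercase=True,  # also lowercase every string column
)
```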
- def basic_clean_drop(input_filepath: Union[str,Path],
-                      output_filepath: Union[str,Path],
-                      log_directory: Union[str,Path],
-                      targets: list[str],
-                      skip_targets: bool=False,
-                      threshold: float=0.8,
-                      all_lowercase: bool=False):
-     """
-     Performs standardized cleaning followed by iterative removal of rows and
-     columns with excessive missing data.
-
-     This function combines the functionality of `basic_clean` and `drop_macro_polars`. It first
-     applies a comprehensive normalization process to all columns in the input CSV file.
-     Then it applies iterative row and column dropping to remove redundant or incomplete data.
-
-     Args:
-         input_filepath (str | Path):
-             The path to the source CSV file to be cleaned.
-         output_filepath (str | Path):
-             The path to save the fully cleaned CSV file after cleaning
-             and missing-data-based pruning.
-         log_directory (str | Path):
-             Path to the directory where missing data reports will be stored.
-         targets (list[str]):
-             A list of column names to be treated as target variables.
-             This list guides the row-dropping logic.
-         skip_targets (bool):
-             If True, the columns listed in `targets` will be exempt from being dropped,
-             even if they exceed the missing data threshold.
-         threshold (float):
-             The proportion of missing data required to drop a row or column.
-             For example, 0.8 means a row/column will be dropped if 80% or more
-             of its data is missing.
-         all_lowercase (bool):
-             Whether to normalize all text to lowercase.
-     """
-     # handle log path
-     log_path = make_fullpath(log_directory, make=True, enforce="directory")
-
-     # Handle df paths
-     input_path, output_path = _local_path_manager(path_in=input_filepath, path_out=output_filepath)
-
-     # load polars df
-     df, _ = load_dataframe(df_path=input_path, kind="polars", all_strings=True)
-
-     # CLEAN
-     df_cleaned = _cleaner_core(df_in=df, all_lowercase=all_lowercase)
-
-     # Drop macro (Polars implementation)
-     df_final = drop_macro_polars(df=df_cleaned,
-                                  log_directory=log_path,
-                                  targets=targets,
-                                  skip_targets=skip_targets,
-                                  threshold=threshold)
-
-     # Save cleaned dataframe
-     save_dataframe_filename(df=df_final, save_dir=output_path.parent, filename=output_path.name)
-
-     _LOGGER.info("Data successfully cleaned.")
-
-
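A hypothetical `basic_clean_drop` call; with `threshold=0.8`, a row or column is pruned once 80% or more of its entries are missing:

```python
from ml_tools.ETL_cleaning import basic_clean_drop

basic_clean_drop(
    input_filepath="data/raw.csv",
    output_filepath="data/clean_pruned.csv",
    log_directory="reports/missing_data",  # receives the null reports and plots
    targets=["target_a", "target_b"],      # rows with all targets null are dropped
    skip_targets=True,                     # never drop the target columns themselves
    threshold=0.8,
    all_lowercase=False,
)
```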
- ########## EXTRACT and CLEAN ##########
- class DragonColumnCleaner:
-     """
-     A configuration object that defines cleaning rules for a single Polars DataFrame column.
-
-     This class holds a dictionary of regex-to-replacement rules, the target column name,
-     and the case-sensitivity setting. It is intended to be used with the DragonDataFrameCleaner.
-
-     Notes:
-         - Define rules from most specific to more general to create a fallback system.
-         - Beware of chain replacements (rules matching strings that have already been
-           changed by a previous rule in the same cleaner).
-     """
-     def __init__(self,
-                  column_name: str,
-                  rules: Union[Dict[str, Union[str, None]], Dict[str, str]],
-                  case_insensitive: bool = False):
-         """
-         Args:
-             column_name (str):
-                 The name of the column to be cleaned.
-             rules (Dict[str, str | None]):
-                 A dictionary of regex patterns to replacement strings.
-                 - Replacement can be None to indicate that matching values should be converted to null.
-                 - Can use backreferences (e.g., r'$1 $2') for captured groups. Note that Polars uses a '$' prefix for backreferences.
-             case_insensitive (bool):
-                 If True, regex matching ignores case.
-
-         ## Usage Example
-
-         ```python
-         id_rules = {
-             # Matches 'ID-12345' or 'ID 12345' and reformats to 'ID:12345'
-             r'ID[- ](\\d+)': r'ID:$1'
-         }
-
-         id_cleaner = DragonColumnCleaner(column_name='user_id', rules=id_rules)
-         # This object would then be passed to a DragonDataFrameCleaner.
-         ```
-         """
-         if not isinstance(column_name, str) or not column_name:
-             _LOGGER.error("The 'column_name' must be a non-empty string.")
-             raise TypeError()
-         if not isinstance(rules, dict):
-             _LOGGER.error("The 'rules' argument must be a dictionary.")
-             raise TypeError()
-         # validate rules
-         for pattern, replacement in rules.items():
-             if not isinstance(pattern, str):
-                 _LOGGER.error("All keys in 'rules' must be strings representing regex patterns.")
-                 raise TypeError()
-             if replacement is not None and not isinstance(replacement, str):
-                 _LOGGER.error("All values in 'rules' must be strings or None (for nullification).")
-                 raise TypeError()
-
-         self.column_name = column_name
-         self.rules = rules
-         self.case_insensitive = case_insensitive
-
-     def preview(self,
-                 csv_path: Union[str, Path],
-                 report_dir: Union[str, Path],
-                 add_value_separator: bool=False,
-                 rule_batch_size: int = 150):
-         """
-         Generates a preview report of unique values in the specified column after applying the current cleaning rules.
-
-         Args:
-             csv_path (str | Path):
-                 The path to the CSV file containing the data to clean.
-             report_dir (str | Path):
-                 The directory where the preview report will be saved.
-             add_value_separator (bool):
-                 If True, adds a separator line between each unique value in the report.
-             rule_batch_size (int):
-                 Splits the regex rules into chunks of this size. Helps prevent memory errors.
-         """
-         # Load DataFrame
-         df, _ = load_dataframe(df_path=csv_path, use_columns=[self.column_name], kind="polars", all_strings=True)
-
-         preview_cleaner = DragonDataFrameCleaner(cleaners=[self])
-         df_preview = preview_cleaner.clean(df, rule_batch_size=rule_batch_size)
-
-         # Apply cleaning rules to a copy of the column for preview
-         save_unique_values(csv_path_or_df=df_preview,
-                            output_dir=report_dir,
-                            use_columns=[self.column_name],
-                            verbose=False,
-                            keep_column_order=False,
-                            add_value_separator=add_value_separator)
-
-
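A sketch of a single-column cleaner that reformats IDs and nullifies junk values, then previews the effect without modifying the source file (hypothetical paths and rules):

```python
from ml_tools.ETL_cleaning import DragonColumnCleaner

user_id_cleaner = DragonColumnCleaner(
    column_name="user_id",
    rules={
        r'ID[- ](\d+)': r'ID:$1',      # most specific rule first
        r'^(unknown|missing)$': None,  # None converts matches to nulls
    },
    case_insensitive=True,
)
# Writes a "<column>_unique_values.txt" report of the cleaned column.
user_id_cleaner.preview(csv_path="data/raw.csv", report_dir="reports/preview")
```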
- class DragonDataFrameCleaner:
-     """
-     Orchestrates cleaning multiple columns in a Polars DataFrame.
-     """
-     def __init__(self, cleaners: List[DragonColumnCleaner]):
-         """
-         Takes a list of DragonColumnCleaner objects and applies their defined
-         rules to the corresponding columns of a DataFrame using high-performance
-         Polars expressions with memory optimization.
-
-         Args:
-             cleaners (List[DragonColumnCleaner]):
-                 A list of DragonColumnCleaner configuration objects.
-         """
-         if not isinstance(cleaners, list):
-             _LOGGER.error("The 'cleaners' argument must be a list of DragonColumnCleaner objects.")
-             raise TypeError()
-
-         seen_columns = set()
-         for cleaner in cleaners:
-             if not isinstance(cleaner, DragonColumnCleaner):
-                 _LOGGER.error(f"All items in 'cleaners' list must be DragonColumnCleaner objects, but found an object of type {type(cleaner).__name__}.")
-                 raise TypeError()
-             if cleaner.column_name in seen_columns:
-                 _LOGGER.error(f"Duplicate DragonColumnCleaner found for column '{cleaner.column_name}'. Each column should only have one cleaner.")
-                 raise ValueError()
-             seen_columns.add(cleaner.column_name)
-
-         self.cleaners = cleaners
-
-     def clean(self, df: Union[pl.DataFrame, pl.LazyFrame],
-               rule_batch_size: int = 150) -> pl.DataFrame:
-         """
-         Applies cleaning rules. Supports lazy execution to handle OOM issues.
-
-         Args:
-             df (pl.DataFrame | pl.LazyFrame):
-                 The data to clean.
-             rule_batch_size (int):
-                 Splits the regex rules into chunks of this size. Helps prevent memory errors.
-
-         Returns:
-             pl.DataFrame: The cleaned, collected DataFrame.
-         """
-         # 1. Validate Columns (only if eager)
-         # Note: For LazyFrames, we assume columns exist or let it fail at collection.
-         if isinstance(df, pl.DataFrame):
-             df_cols = set(df.columns)
-             rule_cols = {c.column_name for c in self.cleaners}
-             missing = rule_cols - df_cols
-             if missing:
-                 _LOGGER.error(f"The following columns specified in cleaners are missing from the DataFrame: {missing}")
-                 raise ValueError()
-
-             # lazy internally
-             lf = df.lazy()
-         else:
-             # It should be a LazyFrame, check type
-             if not isinstance(df, pl.LazyFrame):
-                 _LOGGER.error("The 'df' argument must be a Polars DataFrame or LazyFrame.")
-                 raise TypeError()
-             # It is already a LazyFrame
-             lf = df
-
-         # 2. Build Expression Chain
-         final_lf = lf
-
-         for cleaner in self.cleaners:
-             col_name = cleaner.column_name
-
-             # Get all rules as a list of items
-             all_rules = list(cleaner.rules.items())
-
-             # Process in batches of 'rule_batch_size'
-             for i in range(0, len(all_rules), rule_batch_size):
-                 rule_batch = all_rules[i : i + rule_batch_size]
-
-                 # Start expression for this batch
-                 col_expr = pl.col(col_name).cast(pl.String)
-
-                 for pattern, replacement in rule_batch:
-                     final_pattern = f"(?i){pattern}" if cleaner.case_insensitive else pattern
-
-                     if replacement is None:
-                         col_expr = pl.when(col_expr.str.contains(final_pattern)) \
-                                      .then(None) \
-                                      .otherwise(col_expr)
-                     else:
-                         col_expr = col_expr.str.replace_all(final_pattern, replacement)
-
-                 # Apply this batch of rules to the LazyFrame
-                 final_lf = final_lf.with_columns(col_expr.alias(col_name))
-
-         # 3. Collect Results
-         try:
-             return final_lf.collect(engine="streaming")
-         except Exception as e:
-             _LOGGER.error("An error occurred during the cleaning process.")
-             raise e
-
-     def load_clean_save(self,
-                         input_filepath: Union[str,Path],
-                         output_filepath: Union[str,Path],
-                         rule_batch_size: int = 150):
-         """
-         This convenience method encapsulates the entire cleaning process into a
-         single call. It loads a DataFrame from a specified file, applies all
-         cleaning rules configured in the `DragonDataFrameCleaner` instance, and saves
-         the resulting cleaned DataFrame to a new file.
-
-         The method ensures that all data is loaded as string types to prevent
-         unintended type inference issues before cleaning operations are applied.
-
-         Args:
-             input_filepath (Union[str, Path]):
-                 The path to the input data file.
-             output_filepath (Union[str, Path]):
-                 The full path where the cleaned data file will be saved.
-             rule_batch_size (int):
-                 Splits the regex rules into chunks of this size. Helps prevent memory errors.
-         """
-         df, _ = load_dataframe(df_path=input_filepath, kind="polars", all_strings=True)
-
-         df_clean = self.clean(df=df, rule_batch_size=rule_batch_size)
-
-         if isinstance(output_filepath, str):
-             output_filepath = make_fullpath(input_path=output_filepath, enforce="file")
-
-         save_dataframe_filename(df=df_clean, save_dir=output_filepath.parent, filename=output_filepath.name)
-
-         return None
-
-
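An end-to-end sketch with two column cleaners; `load_clean_save` loads every column as strings, applies the rules, and writes the result (hypothetical paths):

```python
from ml_tools.ETL_cleaning import DragonColumnCleaner, DragonDataFrameCleaner

df_cleaner = DragonDataFrameCleaner(cleaners=[
    DragonColumnCleaner("user_id", rules={r'ID[- ](\d+)': r'ID:$1'}),
    DragonColumnCleaner("status", rules={r'^(n/a|none)$': None}, case_insensitive=True),
])
df_cleaner.load_clean_save(
    input_filepath="data/raw.csv",
    output_filepath="data/clean.csv",
    rule_batch_size=150,  # lower this if large rule sets exhaust memory
)
```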
- def _generate_null_report(df: pl.DataFrame, save_dir: Path, filename: str):
-     """
-     Internal helper to generate and save a CSV report of missing data percentages using Polars.
-     """
-     total_rows = df.height
-     if total_rows == 0:
-         return
-
-     null_stats = df.null_count()
-
-     # Construct a report DataFrame
-     report = pl.DataFrame({
-         "column": df.columns,
-         "null_count": null_stats.transpose().to_series(),
-     }).with_columns(
-         (pl.col("null_count") / total_rows * 100).round(2).alias("missing_percent")
-     ).sort("missing_percent", descending=True)
-
-     save_dataframe_filename(df=report, save_dir=save_dir, filename=filename)
-
-
- def drop_macro_polars(df: pl.DataFrame,
-                       log_directory: Path,
-                       targets: list[str],
-                       skip_targets: bool,
-                       threshold: float) -> pl.DataFrame:
-     """
-     High-performance implementation of iterative row/column pruning using Polars.
-     Includes temporary Pandas conversion for visualization.
-     """
-     df_clean = df.clone()
-
-     # --- Helper to generate plot safely ---
-     def _plot_safe(df_pl: pl.DataFrame, filename: str):
-         try:
-             # converting to pandas just for the plot
-             # use_pyarrow_extension_array=True is faster
-             df_pd = df_pl.to_pandas(use_pyarrow_extension_array=True)
-             show_null_columns(df_pd, plot_to_dir=log_directory, plot_filename=filename, use_all_columns=True)
-         except Exception as e:
-             _LOGGER.warning(f"Skipping plot generation due to error: {e}")
-
-     # 1. Log Initial State
-     _generate_null_report(df_clean, log_directory, "Missing_Data_Original")
-     _plot_safe(df_clean, "Original")
-
-     master = True
-     while master:
-         initial_rows, initial_cols = df_clean.shape
-
-         # --- A. Drop Constant Columns ---
-         # Keep columns where n_unique > 1.
-         # Note: n_unique in Polars ignores nulls by default (similar to pandas dropna=True).
-         # We assume if a column is all nulls, it should also be dropped (n_unique=0).
-         cols_to_keep = [
-             col for col in df_clean.columns
-             if df_clean[col].n_unique() > 1
-         ]
-         df_clean = df_clean.select(cols_to_keep)
-
-         # --- B. Drop Rows (Targets) ---
-         # Drop rows where ALL target columns are null
-         valid_targets = [t for t in targets if t in df_clean.columns]
-         if valid_targets:
-             df_clean = df_clean.filter(
-                 ~pl.all_horizontal(pl.col(valid_targets).is_null())
-             )
-
-         # --- C. Drop Rows (Features Threshold) ---
-         # Drop rows where missing data fraction in FEATURE columns > threshold
-         feature_cols = [c for c in df_clean.columns if c not in valid_targets]
-         if feature_cols:
-             # We want to KEEP rows where (null_count / total_features) <= threshold
-             df_clean = df_clean.filter(
-                 (pl.sum_horizontal(pl.col(feature_cols).is_null()) / len(feature_cols)) <= threshold
-             )
-
-         # --- D. Drop Columns (Threshold) ---
-         # Drop columns where missing data fraction > threshold
-         current_height = df_clean.height
-         if current_height > 0:
-             null_counts = df_clean.null_count().row(0)  # tuple of counts
-             cols_to_drop = []
-
-             for col_idx, col_name in enumerate(df_clean.columns):
-                 # Check if we should skip this column (if it's a target and skip_targets=True)
-                 if skip_targets and col_name in valid_targets:
-                     continue
-
-                 missing_frac = null_counts[col_idx] / current_height
-                 if missing_frac > threshold:
-                     cols_to_drop.append(col_name)
-
-             if cols_to_drop:
-                 df_clean = df_clean.drop(cols_to_drop)
-
-         # --- E. Check Convergence ---
-         remaining_rows, remaining_cols = df_clean.shape
-         if remaining_rows >= initial_rows and remaining_cols >= initial_cols:
-             master = False
-
-     # 2. Log Final State
-     _generate_null_report(df_clean, log_directory, "Missing_Data_Processed")
-     _plot_safe(df_clean, "Processed")
-
-     return df_clean
-
-
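`drop_macro_polars` can also be called directly on an in-memory frame; a sketch (the log directory is created up front, as `basic_clean_drop` does via `make_fullpath`):

```python
from pathlib import Path
import polars as pl
from ml_tools.ETL_cleaning import drop_macro_polars

log_dir = Path("reports/missing_data")
log_dir.mkdir(parents=True, exist_ok=True)

df_pruned = drop_macro_polars(
    df=pl.read_csv("data/clean.csv"),
    log_directory=log_dir,
    targets=["target_a"],
    skip_targets=False,
    threshold=0.8,
)
```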
- def info():
-     _script_info(__all__)