dragon-ml-toolbox 19.14.0__py3-none-any.whl → 20.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dragon_ml_toolbox-19.14.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/METADATA +29 -46
- dragon_ml_toolbox-20.0.0.dist-info/RECORD +178 -0
- ml_tools/{ETL_cleaning.py → ETL_cleaning/__init__.py} +13 -5
- ml_tools/ETL_cleaning/_basic_clean.py +351 -0
- ml_tools/ETL_cleaning/_clean_tools.py +128 -0
- ml_tools/ETL_cleaning/_dragon_cleaner.py +245 -0
- ml_tools/ETL_cleaning/_imprimir.py +13 -0
- ml_tools/{ETL_engineering.py → ETL_engineering/__init__.py} +8 -4
- ml_tools/ETL_engineering/_dragon_engineering.py +261 -0
- ml_tools/ETL_engineering/_imprimir.py +24 -0
- ml_tools/{_core/_ETL_engineering.py → ETL_engineering/_transforms.py} +14 -267
- ml_tools/{_core → GUI_tools}/_GUI_tools.py +37 -40
- ml_tools/{GUI_tools.py → GUI_tools/__init__.py} +7 -5
- ml_tools/GUI_tools/_imprimir.py +12 -0
- ml_tools/IO_tools/_IO_loggers.py +235 -0
- ml_tools/IO_tools/_IO_save_load.py +151 -0
- ml_tools/IO_tools/_IO_utils.py +140 -0
- ml_tools/{IO_tools.py → IO_tools/__init__.py} +13 -5
- ml_tools/IO_tools/_imprimir.py +14 -0
- ml_tools/MICE/_MICE_imputation.py +132 -0
- ml_tools/{MICE_imputation.py → MICE/__init__.py} +6 -7
- ml_tools/{_core/_MICE_imputation.py → MICE/_dragon_mice.py} +243 -322
- ml_tools/MICE/_imprimir.py +11 -0
- ml_tools/{ML_callbacks.py → ML_callbacks/__init__.py} +12 -4
- ml_tools/ML_callbacks/_base.py +101 -0
- ml_tools/ML_callbacks/_checkpoint.py +232 -0
- ml_tools/ML_callbacks/_early_stop.py +208 -0
- ml_tools/ML_callbacks/_imprimir.py +12 -0
- ml_tools/ML_callbacks/_scheduler.py +197 -0
- ml_tools/{ML_chaining_utilities.py → ML_chain/__init__.py} +8 -3
- ml_tools/{_core/_ML_chaining_utilities.py → ML_chain/_chaining_tools.py} +5 -129
- ml_tools/ML_chain/_dragon_chain.py +140 -0
- ml_tools/ML_chain/_imprimir.py +11 -0
- ml_tools/ML_configuration/__init__.py +90 -0
- ml_tools/ML_configuration/_base_model_config.py +69 -0
- ml_tools/ML_configuration/_finalize.py +366 -0
- ml_tools/ML_configuration/_imprimir.py +47 -0
- ml_tools/ML_configuration/_metrics.py +593 -0
- ml_tools/ML_configuration/_models.py +206 -0
- ml_tools/ML_configuration/_training.py +124 -0
- ml_tools/ML_datasetmaster/__init__.py +28 -0
- ml_tools/ML_datasetmaster/_base_datasetmaster.py +337 -0
- ml_tools/{_core/_ML_datasetmaster.py → ML_datasetmaster/_datasetmaster.py} +9 -329
- ml_tools/ML_datasetmaster/_imprimir.py +15 -0
- ml_tools/{_core/_ML_sequence_datasetmaster.py → ML_datasetmaster/_sequence_datasetmaster.py} +13 -15
- ml_tools/{_core/_ML_vision_datasetmaster.py → ML_datasetmaster/_vision_datasetmaster.py} +63 -65
- ml_tools/ML_evaluation/__init__.py +53 -0
- ml_tools/ML_evaluation/_classification.py +629 -0
- ml_tools/ML_evaluation/_feature_importance.py +409 -0
- ml_tools/ML_evaluation/_imprimir.py +25 -0
- ml_tools/ML_evaluation/_loss.py +92 -0
- ml_tools/ML_evaluation/_regression.py +273 -0
- ml_tools/{_core/_ML_sequence_evaluation.py → ML_evaluation/_sequence.py} +8 -11
- ml_tools/{_core/_ML_vision_evaluation.py → ML_evaluation/_vision.py} +12 -17
- ml_tools/{_core → ML_evaluation_captum}/_ML_evaluation_captum.py +11 -38
- ml_tools/{ML_evaluation_captum.py → ML_evaluation_captum/__init__.py} +6 -4
- ml_tools/ML_evaluation_captum/_imprimir.py +10 -0
- ml_tools/{_core → ML_finalize_handler}/_ML_finalize_handler.py +3 -7
- ml_tools/ML_finalize_handler/__init__.py +10 -0
- ml_tools/ML_finalize_handler/_imprimir.py +8 -0
- ml_tools/ML_inference/__init__.py +22 -0
- ml_tools/ML_inference/_base_inference.py +166 -0
- ml_tools/{_core/_ML_chaining_inference.py → ML_inference/_chain_inference.py} +14 -17
- ml_tools/ML_inference/_dragon_inference.py +332 -0
- ml_tools/ML_inference/_imprimir.py +11 -0
- ml_tools/ML_inference/_multi_inference.py +180 -0
- ml_tools/ML_inference_sequence/__init__.py +10 -0
- ml_tools/ML_inference_sequence/_imprimir.py +8 -0
- ml_tools/{_core/_ML_sequence_inference.py → ML_inference_sequence/_sequence_inference.py} +11 -15
- ml_tools/ML_inference_vision/__init__.py +10 -0
- ml_tools/ML_inference_vision/_imprimir.py +8 -0
- ml_tools/{_core/_ML_vision_inference.py → ML_inference_vision/_vision_inference.py} +15 -19
- ml_tools/ML_models/__init__.py +32 -0
- ml_tools/{_core/_ML_models_advanced.py → ML_models/_advanced_models.py} +22 -18
- ml_tools/ML_models/_base_mlp_attention.py +198 -0
- ml_tools/{_core/_models_advanced_base.py → ML_models/_base_save_load.py} +73 -49
- ml_tools/ML_models/_dragon_tabular.py +248 -0
- ml_tools/ML_models/_imprimir.py +18 -0
- ml_tools/ML_models/_mlp_attention.py +134 -0
- ml_tools/{_core → ML_models}/_models_advanced_helpers.py +13 -13
- ml_tools/ML_models_sequence/__init__.py +10 -0
- ml_tools/ML_models_sequence/_imprimir.py +8 -0
- ml_tools/{_core/_ML_sequence_models.py → ML_models_sequence/_sequence_models.py} +5 -8
- ml_tools/ML_models_vision/__init__.py +29 -0
- ml_tools/ML_models_vision/_base_wrapper.py +254 -0
- ml_tools/ML_models_vision/_image_classification.py +182 -0
- ml_tools/ML_models_vision/_image_segmentation.py +108 -0
- ml_tools/ML_models_vision/_imprimir.py +16 -0
- ml_tools/ML_models_vision/_object_detection.py +135 -0
- ml_tools/ML_optimization/__init__.py +21 -0
- ml_tools/ML_optimization/_imprimir.py +13 -0
- ml_tools/{_core/_ML_optimization_pareto.py → ML_optimization/_multi_dragon.py} +18 -24
- ml_tools/ML_optimization/_single_dragon.py +203 -0
- ml_tools/{_core/_ML_optimization.py → ML_optimization/_single_manual.py} +75 -213
- ml_tools/{_core → ML_scaler}/_ML_scaler.py +8 -11
- ml_tools/ML_scaler/__init__.py +10 -0
- ml_tools/ML_scaler/_imprimir.py +8 -0
- ml_tools/ML_trainer/__init__.py +20 -0
- ml_tools/ML_trainer/_base_trainer.py +297 -0
- ml_tools/ML_trainer/_dragon_detection_trainer.py +402 -0
- ml_tools/ML_trainer/_dragon_sequence_trainer.py +540 -0
- ml_tools/ML_trainer/_dragon_trainer.py +1160 -0
- ml_tools/ML_trainer/_imprimir.py +10 -0
- ml_tools/{ML_utilities.py → ML_utilities/__init__.py} +14 -6
- ml_tools/ML_utilities/_artifact_finder.py +382 -0
- ml_tools/ML_utilities/_imprimir.py +16 -0
- ml_tools/ML_utilities/_inspection.py +325 -0
- ml_tools/ML_utilities/_train_tools.py +205 -0
- ml_tools/{ML_vision_transformers.py → ML_vision_transformers/__init__.py} +9 -6
- ml_tools/{_core/_ML_vision_transformers.py → ML_vision_transformers/_core_transforms.py} +11 -155
- ml_tools/ML_vision_transformers/_imprimir.py +14 -0
- ml_tools/ML_vision_transformers/_offline_augmentation.py +159 -0
- ml_tools/{_core/_PSO_optimization.py → PSO_optimization/_PSO.py} +58 -15
- ml_tools/{PSO_optimization.py → PSO_optimization/__init__.py} +5 -3
- ml_tools/PSO_optimization/_imprimir.py +10 -0
- ml_tools/SQL/__init__.py +7 -0
- ml_tools/{_core/_SQL.py → SQL/_dragon_SQL.py} +7 -11
- ml_tools/SQL/_imprimir.py +8 -0
- ml_tools/{_core → VIF}/_VIF_factor.py +5 -8
- ml_tools/{VIF_factor.py → VIF/__init__.py} +4 -2
- ml_tools/VIF/_imprimir.py +10 -0
- ml_tools/_core/__init__.py +7 -1
- ml_tools/_core/_logger.py +8 -18
- ml_tools/_core/_schema_load_ops.py +43 -0
- ml_tools/_core/_script_info.py +2 -2
- ml_tools/{data_exploration.py → data_exploration/__init__.py} +32 -16
- ml_tools/data_exploration/_analysis.py +214 -0
- ml_tools/data_exploration/_cleaning.py +566 -0
- ml_tools/data_exploration/_features.py +583 -0
- ml_tools/data_exploration/_imprimir.py +32 -0
- ml_tools/data_exploration/_plotting.py +487 -0
- ml_tools/data_exploration/_schema_ops.py +176 -0
- ml_tools/{ensemble_evaluation.py → ensemble_evaluation/__init__.py} +6 -4
- ml_tools/{_core → ensemble_evaluation}/_ensemble_evaluation.py +3 -7
- ml_tools/ensemble_evaluation/_imprimir.py +14 -0
- ml_tools/{ensemble_inference.py → ensemble_inference/__init__.py} +5 -3
- ml_tools/{_core → ensemble_inference}/_ensemble_inference.py +15 -18
- ml_tools/ensemble_inference/_imprimir.py +9 -0
- ml_tools/{ensemble_learning.py → ensemble_learning/__init__.py} +4 -6
- ml_tools/{_core → ensemble_learning}/_ensemble_learning.py +7 -10
- ml_tools/ensemble_learning/_imprimir.py +10 -0
- ml_tools/{excel_handler.py → excel_handler/__init__.py} +5 -3
- ml_tools/{_core → excel_handler}/_excel_handler.py +6 -10
- ml_tools/excel_handler/_imprimir.py +13 -0
- ml_tools/{keys.py → keys/__init__.py} +4 -1
- ml_tools/keys/_imprimir.py +11 -0
- ml_tools/{_core → keys}/_keys.py +2 -0
- ml_tools/{math_utilities.py → math_utilities/__init__.py} +5 -2
- ml_tools/math_utilities/_imprimir.py +11 -0
- ml_tools/{_core → math_utilities}/_math_utilities.py +1 -5
- ml_tools/{optimization_tools.py → optimization_tools/__init__.py} +9 -4
- ml_tools/optimization_tools/_imprimir.py +13 -0
- ml_tools/optimization_tools/_optimization_bounds.py +236 -0
- ml_tools/optimization_tools/_optimization_plots.py +218 -0
- ml_tools/{path_manager.py → path_manager/__init__.py} +6 -3
- ml_tools/{_core/_path_manager.py → path_manager/_dragonmanager.py} +11 -347
- ml_tools/path_manager/_imprimir.py +15 -0
- ml_tools/path_manager/_path_tools.py +346 -0
- ml_tools/plot_fonts/__init__.py +8 -0
- ml_tools/plot_fonts/_imprimir.py +8 -0
- ml_tools/{_core → plot_fonts}/_plot_fonts.py +2 -5
- ml_tools/schema/__init__.py +15 -0
- ml_tools/schema/_feature_schema.py +223 -0
- ml_tools/schema/_gui_schema.py +191 -0
- ml_tools/schema/_imprimir.py +10 -0
- ml_tools/{serde.py → serde/__init__.py} +4 -2
- ml_tools/serde/_imprimir.py +10 -0
- ml_tools/{_core → serde}/_serde.py +3 -8
- ml_tools/{utilities.py → utilities/__init__.py} +11 -6
- ml_tools/utilities/_imprimir.py +18 -0
- ml_tools/{_core/_utilities.py → utilities/_utility_save_load.py} +13 -190
- ml_tools/utilities/_utility_tools.py +192 -0
- dragon_ml_toolbox-19.14.0.dist-info/RECORD +0 -111
- ml_tools/ML_chaining_inference.py +0 -8
- ml_tools/ML_configuration.py +0 -86
- ml_tools/ML_configuration_pytab.py +0 -14
- ml_tools/ML_datasetmaster.py +0 -10
- ml_tools/ML_evaluation.py +0 -16
- ml_tools/ML_evaluation_multi.py +0 -12
- ml_tools/ML_finalize_handler.py +0 -8
- ml_tools/ML_inference.py +0 -12
- ml_tools/ML_models.py +0 -14
- ml_tools/ML_models_advanced.py +0 -14
- ml_tools/ML_models_pytab.py +0 -14
- ml_tools/ML_optimization.py +0 -14
- ml_tools/ML_optimization_pareto.py +0 -8
- ml_tools/ML_scaler.py +0 -8
- ml_tools/ML_sequence_datasetmaster.py +0 -8
- ml_tools/ML_sequence_evaluation.py +0 -10
- ml_tools/ML_sequence_inference.py +0 -8
- ml_tools/ML_sequence_models.py +0 -8
- ml_tools/ML_trainer.py +0 -12
- ml_tools/ML_vision_datasetmaster.py +0 -12
- ml_tools/ML_vision_evaluation.py +0 -10
- ml_tools/ML_vision_inference.py +0 -8
- ml_tools/ML_vision_models.py +0 -18
- ml_tools/SQL.py +0 -8
- ml_tools/_core/_ETL_cleaning.py +0 -694
- ml_tools/_core/_IO_tools.py +0 -498
- ml_tools/_core/_ML_callbacks.py +0 -702
- ml_tools/_core/_ML_configuration.py +0 -1332
- ml_tools/_core/_ML_configuration_pytab.py +0 -102
- ml_tools/_core/_ML_evaluation.py +0 -867
- ml_tools/_core/_ML_evaluation_multi.py +0 -544
- ml_tools/_core/_ML_inference.py +0 -646
- ml_tools/_core/_ML_models.py +0 -668
- ml_tools/_core/_ML_models_pytab.py +0 -693
- ml_tools/_core/_ML_trainer.py +0 -2323
- ml_tools/_core/_ML_utilities.py +0 -886
- ml_tools/_core/_ML_vision_models.py +0 -644
- ml_tools/_core/_data_exploration.py +0 -1909
- ml_tools/_core/_optimization_tools.py +0 -493
- ml_tools/_core/_schema.py +0 -359
- ml_tools/plot_fonts.py +0 -8
- ml_tools/schema.py +0 -12
- {dragon_ml_toolbox-19.14.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-19.14.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-19.14.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-19.14.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/top_level.txt +0 -0
ml_tools/ETL_cleaning/_basic_clean.py (new file, +351 lines)

@@ -0,0 +1,351 @@
import polars as pl
from pathlib import Path
from typing import Union

from ..data_exploration import show_null_columns
from ..utilities import save_dataframe_filename, load_dataframe

from ..path_manager import make_fullpath
from .._core import get_logger

from ._dragon_cleaner import DragonColumnCleaner, DragonDataFrameCleaner

_LOGGER = get_logger("ETL Basic Clean")


__all__ = [
    "basic_clean",
    "basic_clean_drop",
    "drop_macro_polars",
]


########## Basic cleaners #############
def _cleaner_core(df_in: pl.DataFrame, all_lowercase: bool) -> pl.DataFrame:
    # Cleaning rules (regex pattern -> replacement; None nullifies the match)
    cleaning_rules = {
        # 1. Comprehensive Punctuation & Symbol Normalization
        # Remove invisible control characters
        r'\p{C}+': '',

        # Full-width to half-width
        # Numbers
        '０': '0', '１': '1', '２': '2', '３': '3', '４': '4',
        '５': '5', '６': '6', '７': '7', '８': '8', '９': '9',
        # Superscripts & Subscripts
        '¹': '1', '²': '2', '³': '3', '⁴': '4', '⁵': '5',
        '⁶': '6', '⁷': '7', '⁸': '8', '⁹': '9', '⁰': '0',
        '₁': '1', '₂': '2', '₃': '3', '₄': '4', '₅': '5',
        '₆': '6', '₇': '7', '₈': '8', '₉': '9', '₀': '0',
        '⁺': '', '⁻': '', '₊': '', '₋': '',
        # Uppercase Alphabet
        'Ａ': 'A', 'Ｂ': 'B', 'Ｃ': 'C', 'Ｄ': 'D', 'Ｅ': 'E', 'Ｆ': 'F',
        'Ｇ': 'G', 'Ｈ': 'H', 'Ｉ': 'I', 'Ｊ': 'J', 'Ｋ': 'K', 'Ｌ': 'L',
        'Ｍ': 'M', 'Ｎ': 'N', 'Ｏ': 'O', 'Ｐ': 'P', 'Ｑ': 'Q', 'Ｒ': 'R',
        'Ｓ': 'S', 'Ｔ': 'T', 'Ｕ': 'U', 'Ｖ': 'V', 'Ｗ': 'W', 'Ｘ': 'X',
        'Ｙ': 'Y', 'Ｚ': 'Z',
        # Lowercase Alphabet
        'ａ': 'a', 'ｂ': 'b', 'ｃ': 'c', 'ｄ': 'd', 'ｅ': 'e', 'ｆ': 'f',
        'ｇ': 'g', 'ｈ': 'h', 'ｉ': 'i', 'ｊ': 'j', 'ｋ': 'k', 'ｌ': 'l',
        'ｍ': 'm', 'ｎ': 'n', 'ｏ': 'o', 'ｐ': 'p', 'ｑ': 'q', 'ｒ': 'r',
        'ｓ': 's', 'ｔ': 't', 'ｕ': 'u', 'ｖ': 'v', 'ｗ': 'w', 'ｘ': 'x',
        'ｙ': 'y', 'ｚ': 'z',
        # Punctuation
        '》': '>', '《': '<', '：': ':', '。': '.', '；': ';', '【': '[', '】': ']', '∼': '~',
        '（': '(', '）': ')', '？': '?', '！': '!', '～': '~', '＠': '@', '＃': '#', '＋': '+', '－': '-',
        '＄': '$', '％': '%', '＾': '^', '＆': '&', '＊': '*', '＼': '-', '｜': '|', '≈': '=', '·': '', '⋅': '',
        '¯': '-', '_': '-',

        # Commas (avoid commas in entries)
        '，': ';',
        ',': ';',
        '、': ';',

        # Others
        'σ': '',
        '□': '',
        '©': '',
        '®': '',
        '™': '',
        r'[°˚]': '',

        # Replace special characters in entries
        r'\\': '_',

        # Typographical standardization
        # Unify various dashes and hyphens to a standard hyphen
        r'[—–―]': '-',
        r'−': '-',
        # remove various quote types
        r'[“”"]': '',
        r"[‘’′']": '',

        # Collapse repeating punctuation
        r'\.{2,}': '.',   # Replace two or more dots with a single dot
        r'\?{2,}': '?',   # Replace two or more question marks with a single one
        r'!{2,}': '!',    # Replace two or more exclamation marks with a single one
        r';{2,}': ';',
        r'-{2,}': '-',
        r'/{2,}': '/',
        r'%{2,}': '%',
        r'&{2,}': '&',

        # 2. Internal Whitespace Consolidation
        # Collapse any sequence of whitespace chars (including non-breaking spaces) to a single space
        r'\s+': ' ',

        # 3. Leading/Trailing Whitespace Removal
        # Strip any whitespace from the beginning or end of the string
        r'^\s+|\s+$': '',

        # 4. Textual Null Standardization
        # Convert common null-like text (e.g. "N/A", or Chinese "无", meaning "none") to actual nulls
        r'^(N/A|无|NA|NULL|NONE|NIL|-|\.|;|/|%|&)$': None,

        # 5. Final Nullification of Empty Strings
        # After all cleaning, if a string is now empty, convert it to a null
        r'^\s*$': None,
        r'^$': None,
    }

    # Clean data
    try:
        # Create a cleaner for every column in the dataframe
        all_columns = df_in.columns
        column_cleaners = [
            DragonColumnCleaner(col, rules=cleaning_rules, case_insensitive=True) for col in all_columns
        ]

        # Instantiate and run the main dataframe cleaner
        df_cleaner = DragonDataFrameCleaner(cleaners=column_cleaners)
        df_cleaned = df_cleaner.clean(df_in)

        # apply lowercase to all string columns
        if all_lowercase:
            df_final = df_cleaned.with_columns(
                pl.col(pl.String).str.to_lowercase()
            )
        else:
            df_final = df_cleaned

    except Exception as e:
        _LOGGER.error("An error occurred during the cleaning process.")
        raise e
    else:
        return df_final
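
The DragonColumnCleaner and DragonDataFrameCleaner classes that consume these rules live in _dragon_cleaner.py (also added in this release) and are not part of this hunk. As a rough illustration of the rule format, each key is a regular-expression pattern and each value its replacement; the sketch below applies a hand-picked subset of the rules with plain Polars, which covers everything except the None replacements (true nullification) that the real cleaner handles:

# Illustrative sketch only; demo_rules is a small subset of the rules above.
import polars as pl

df = pl.DataFrame({"col": ["  hello   world!!  ", "１２３", "fine"]})
demo_rules = {'１': '1', '２': '2', '３': '3', r'!{2,}': '!', r'\s+': ' ', r'^\s+|\s+$': ''}

expr = pl.col("col")
for pattern, replacement in demo_rules.items():
    # str.replace_all treats the pattern as a regex; rules apply in insertion order
    expr = expr.str.replace_all(pattern, replacement)

print(df.with_columns(expr.alias("cleaned")))
# "  hello   world!!  " -> "hello world!", "１２３" -> "123", "fine" -> "fine"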


def _local_path_manager(path_in: Union[str, Path], path_out: Union[str, Path]):
    # Handle paths
    input_path = make_fullpath(path_in, enforce="file")

    parent_dir = make_fullpath(Path(path_out).parent, make=True, enforce="directory")
    output_path = parent_dir / Path(path_out).name

    return input_path, output_path


def basic_clean(input_filepath: Union[str, Path], output_filepath: Union[str, Path], all_lowercase: bool = False):
    """
    Performs a comprehensive, standardized cleaning on all columns of a CSV file.

    The cleaning process includes:
    - Normalizing full-width and typographical punctuation to standard equivalents.
    - Consolidating all internal whitespace (spaces, tabs, newlines) into a single space.
    - Stripping any leading or trailing whitespace.
    - Converting common textual representations of null (e.g., "N/A", "NULL") to true null values.
    - Converting strings that become empty after cleaning into true null values.
    - Optionally normalizing all text to lowercase.

    Args:
        input_filepath (str | Path):
            The path to the source CSV file to be cleaned.
        output_filepath (str | Path):
            The path to save the cleaned CSV file.
        all_lowercase (bool):
            Whether to normalize all text to lowercase.
    """
    # Handle paths
    input_path, output_path = _local_path_manager(path_in=input_filepath, path_out=output_filepath)

    # load polars df
    df, _ = load_dataframe(df_path=input_path, kind="polars", all_strings=True)

    # CLEAN
    df_final = _cleaner_core(df_in=df, all_lowercase=all_lowercase)

    # Save cleaned dataframe
    save_dataframe_filename(df=df_final, save_dir=output_path.parent, filename=output_path.name)

    _LOGGER.info("Data successfully cleaned.")


def basic_clean_drop(input_filepath: Union[str, Path],
                     output_filepath: Union[str, Path],
                     log_directory: Union[str, Path],
                     targets: list[str],
                     skip_targets: bool = False,
                     threshold: float = 0.8,
                     all_lowercase: bool = False):
    """
    Performs standardized cleaning followed by iterative removal of rows and
    columns with excessive missing data.

    This function combines the functionality of `basic_clean` and `drop_macro_polars`. It first
    applies a comprehensive normalization process to all columns in the input CSV file.
    Then it applies iterative row and column dropping to remove redundant or incomplete data.

    Args:
        input_filepath (str | Path):
            The path to the source CSV file to be cleaned.
        output_filepath (str | Path):
            The path to save the fully cleaned CSV file after cleaning
            and missing-data-based pruning.
        log_directory (str | Path):
            Path to the directory where missing data reports will be stored.
        targets (list[str]):
            A list of column names to be treated as target variables.
            This list guides the row-dropping logic.
        skip_targets (bool):
            If True, the columns listed in `targets` are exempt from being dropped,
            even if they exceed the missing data threshold.
        threshold (float):
            The proportion of missing data required to drop a row or column.
            For example, 0.8 means a row or column is dropped when more than
            80% of its data is missing.
        all_lowercase (bool):
            Whether to normalize all text to lowercase.
    """
    # handle log path
    log_path = make_fullpath(log_directory, make=True, enforce="directory")

    # Handle df paths
    input_path, output_path = _local_path_manager(path_in=input_filepath, path_out=output_filepath)

    # load polars df
    df, _ = load_dataframe(df_path=input_path, kind="polars", all_strings=True)

    # CLEAN
    df_cleaned = _cleaner_core(df_in=df, all_lowercase=all_lowercase)

    # Drop macro (Polars implementation)
    df_final = drop_macro_polars(df=df_cleaned,
                                 log_directory=log_path,
                                 targets=targets,
                                 skip_targets=skip_targets,
                                 threshold=threshold)

    # Save cleaned dataframe
    save_dataframe_filename(df=df_final, save_dir=output_path.parent, filename=output_path.name)

    _LOGGER.info("Data successfully cleaned.")


########## EXTRACT and CLEAN ##########
def _generate_null_report(df: pl.DataFrame, save_dir: Path, filename: str):
    """
    Internal helper to generate and save a CSV report of missing data percentages using Polars.
    """
    total_rows = df.height
    if total_rows == 0:
        return

    null_stats = df.null_count()

    # Construct a report DataFrame
    report = pl.DataFrame({
        "column": df.columns,
        "null_count": null_stats.transpose().to_series(),
    }).with_columns(
        (pl.col("null_count") / total_rows * 100).round(2).alias("missing_percent")
    ).sort("missing_percent", descending=True)

    save_dataframe_filename(df=report, save_dir=save_dir, filename=filename)


def drop_macro_polars(df: pl.DataFrame,
                      log_directory: Path,
                      targets: list[str],
                      skip_targets: bool,
                      threshold: float) -> pl.DataFrame:
    """
    High-performance implementation of iterative row/column pruning using Polars.
    Includes temporary Pandas conversion for visualization.
    """
    df_clean = df.clone()

    # --- Helper to generate plot safely ---
    def _plot_safe(df_pl: pl.DataFrame, filename: str):
        try:
            # converting to pandas just for the plot
            # use_pyarrow_extension_array=True is faster
            df_pd = df_pl.to_pandas(use_pyarrow_extension_array=True)
            show_null_columns(df_pd, plot_to_dir=log_directory, plot_filename=filename, use_all_columns=True)
        except Exception as e:
            _LOGGER.warning(f"Skipping plot generation due to error: {e}")

    # 1. Log Initial State
    _generate_null_report(df_clean, log_directory, "Missing_Data_Original")
    _plot_safe(df_clean, "Original")

    master = True
    while master:
        initial_rows, initial_cols = df_clean.shape

        # --- A. Drop Constant Columns ---
        # Keep columns where n_unique > 1.
        # Note: Polars counts null as a value in n_unique, so an all-null column
        # has n_unique == 1 and is dropped here along with constant columns.
        cols_to_keep = [
            col for col in df_clean.columns
            if df_clean[col].n_unique() > 1
        ]
        df_clean = df_clean.select(cols_to_keep)

        # --- B. Drop Rows (Targets) ---
        # Drop rows where ALL target columns are null
        valid_targets = [t for t in targets if t in df_clean.columns]
        if valid_targets:
            df_clean = df_clean.filter(
                ~pl.all_horizontal(pl.col(valid_targets).is_null())
            )

        # --- C. Drop Rows (Features Threshold) ---
        # Drop rows where the missing data fraction in FEATURE columns > threshold
        feature_cols = [c for c in df_clean.columns if c not in valid_targets]
        if feature_cols:
            # We want to KEEP rows where (null_count / total_features) <= threshold
            df_clean = df_clean.filter(
                (pl.sum_horizontal(pl.col(feature_cols).is_null()) / len(feature_cols)) <= threshold
            )

        # --- D. Drop Columns (Threshold) ---
        # Drop columns where the missing data fraction > threshold
        current_height = df_clean.height
        if current_height > 0:
            null_counts = df_clean.null_count().row(0)  # tuple of counts
            cols_to_drop = []

            for col_idx, col_name in enumerate(df_clean.columns):
                # Skip this column if it is a target and skip_targets=True
                if skip_targets and col_name in valid_targets:
                    continue

                missing_frac = null_counts[col_idx] / current_height
                if missing_frac > threshold:
                    cols_to_drop.append(col_name)

            if cols_to_drop:
                df_clean = df_clean.drop(cols_to_drop)

        # --- E. Check Convergence ---
        remaining_rows, remaining_cols = df_clean.shape
        if remaining_rows >= initial_rows and remaining_cols >= initial_cols:
            master = False

    # 2. Log Final State
    _generate_null_report(df_clean, log_directory, "Missing_Data_Processed")
    _plot_safe(df_clean, "Processed")

    return df_clean
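
The while-loop above terminates because each pass can only remove rows or columns; once a full pass removes nothing, the shape comparison sets master to False. A minimal sketch of calling the pruning pass directly on an in-memory frame (again assuming a package re-export; reports and plots are written into the given log directory):

from pathlib import Path
import polars as pl
from ml_tools.ETL_cleaning import drop_macro_polars  # assumed re-export

df = pl.DataFrame({
    "target":   ["1", None, "0", None],      # rows 2 and 4 have a null target
    "all_null": [None, None, None, None],    # dropped: nothing but nulls
    "constant": ["a", "a", "a", "a"],        # dropped: only one unique value
    "feature":  ["x", None, "y", "z"],
})

pruned = drop_macro_polars(df, log_directory=Path("logs"), targets=["target"],
                           skip_targets=True, threshold=0.8)
print(pruned)  # 2 rows x 2 columns: "target" and "feature"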
ml_tools/ETL_cleaning/_clean_tools.py (new file, +128 lines)

@@ -0,0 +1,128 @@
import polars as pl
from pathlib import Path
from typing import Union, Optional

from ..utilities import load_dataframe

from ..path_manager import sanitize_filename, make_fullpath
from .._core import get_logger


_LOGGER = get_logger("ETL Clean Tools")


__all__ = [
    "save_unique_values",
]


################ Unique Values per column #################
def save_unique_values(csv_path_or_df: Union[str, Path, pl.DataFrame],
                       output_dir: Union[str, Path],
                       use_columns: Optional[list[str]] = None,
                       verbose: bool = False,
                       keep_column_order: bool = True,
                       add_value_separator: bool = False) -> None:
    """
    Loads a CSV file or Polars DataFrame, analyzes it, and saves the unique
    non-null values of each column, exactly as they appear, to a separate text file.

    This is useful for understanding the raw categories or range of values
    within a dataset before and after cleaning.

    Args:
        csv_path_or_df (str | Path | pl.DataFrame):
            The file path to the input CSV file or a Polars DataFrame.
        output_dir (str | Path):
            The path to the directory where the .txt files will be saved.
            The directory will be created if it does not exist.
        use_columns (list[str] | None):
            If provided, only these columns are processed. If None, all columns are processed.
        verbose (bool):
            If True, prints the number of unique values saved for each column.
        keep_column_order (bool):
            If True, prepends a numeric prefix to each output filename to
            maintain the original column order.
        add_value_separator (bool):
            If True, adds a separator line between each unique value.
    """
    # 1. Handle input DataFrame or path
    if isinstance(csv_path_or_df, pl.DataFrame):
        df = csv_path_or_df
        if use_columns is not None:
            # Validate columns exist
            valid_cols = [c for c in use_columns if c in df.columns]
            if not valid_cols:
                _LOGGER.error("None of the specified columns in 'use_columns' exist in the provided DataFrame.")
                raise ValueError()
            df = df.select(valid_cols)
    else:
        csv_path = make_fullpath(input_path=csv_path_or_df, enforce="file")
        df = load_dataframe(df_path=csv_path, use_columns=use_columns, kind="polars", all_strings=True)[0]

    output_dir = make_fullpath(input_path=output_dir, make=True, enforce='directory')

    if df.height == 0:
        _LOGGER.warning("The input DataFrame is empty. No unique values to save.")
        return

    # --- 2. Process Each Column ---
    counter = 0

    # Iterate over columns using Polars methods
    for i, column_name in enumerate(df.columns):
        try:
            col_expr = pl.col(column_name)

            # Check if the column is string-based (String or Utf8)
            dtype = df.schema[column_name]
            if dtype in (pl.String, pl.Utf8):
                # Filter out actual empty strings AND whitespace-only strings
                dataset = df.select(col_expr).filter(
                    col_expr.str.strip_chars().str.len_chars() > 0
                )
            else:
                dataset = df.select(col_expr)

            # Efficiently get unique non-null values and sort them
            unique_frame = dataset.drop_nulls().unique().sort(column_name)

            # Convert to a python list for writing
            sorted_uniques = unique_frame.to_series().to_list()

        except Exception:
            _LOGGER.error(f"Could not process column '{column_name}'.")
            continue

        if not sorted_uniques:
            _LOGGER.warning(f"Column '{column_name}' has no unique non-null values. Skipping.")
            continue

        # --- 3. Filename Generation ---
        sanitized_name = sanitize_filename(column_name)
        if not sanitized_name.strip('_'):
            sanitized_name = f'column_{i}'

        prefix = f"{i + 1}_" if keep_column_order else ''
        file_path = output_dir / f"{prefix}{sanitized_name}_unique_values.txt"

        # --- 4. Write to File ---
        try:
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(f"# Unique values for column: '{column_name}'\n")
                f.write(f"# Total unique non-null values: {len(sorted_uniques)}\n")
                f.write("-" * 30 + "\n")

                for value in sorted_uniques:
                    f.write(f"{value}\n")
                    if add_value_separator:
                        f.write("-" * 30 + "\n")

        except IOError:
            _LOGGER.exception(f"Error writing to file {file_path}.")
        else:
            if verbose:
                print(f"  Successfully saved {len(sorted_uniques)} unique values from '{column_name}'.")
            counter += 1

    _LOGGER.info(f"{counter} files of unique values created.")
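
Finally, a usage sketch for the unique-value dump (hypothetical paths and column names; assumes the package re-exports save_unique_values):

from ml_tools.ETL_cleaning import save_unique_values  # assumed re-export

save_unique_values(
    csv_path_or_df="data/survey_clean.csv",  # a CSV path or a pl.DataFrame
    output_dir="reports/unique_values",      # created if it does not exist
    use_columns=["city", "occupation"],      # None processes every column
    verbose=True,
    keep_column_order=True,                  # filenames get a 1_, 2_, ... prefix
)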