dragon-ml-toolbox 19.13.0__py3-none-any.whl → 20.0.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
- {dragon_ml_toolbox-19.13.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/METADATA +29 -46
- dragon_ml_toolbox-20.0.0.dist-info/RECORD +178 -0
- ml_tools/{ETL_cleaning.py → ETL_cleaning/__init__.py} +13 -5
- ml_tools/ETL_cleaning/_basic_clean.py +351 -0
- ml_tools/ETL_cleaning/_clean_tools.py +128 -0
- ml_tools/ETL_cleaning/_dragon_cleaner.py +245 -0
- ml_tools/ETL_cleaning/_imprimir.py +13 -0
- ml_tools/{ETL_engineering.py → ETL_engineering/__init__.py} +8 -4
- ml_tools/ETL_engineering/_dragon_engineering.py +261 -0
- ml_tools/ETL_engineering/_imprimir.py +24 -0
- ml_tools/{_core/_ETL_engineering.py → ETL_engineering/_transforms.py} +14 -267
- ml_tools/{_core → GUI_tools}/_GUI_tools.py +37 -40
- ml_tools/{GUI_tools.py → GUI_tools/__init__.py} +7 -5
- ml_tools/GUI_tools/_imprimir.py +12 -0
- ml_tools/IO_tools/_IO_loggers.py +235 -0
- ml_tools/IO_tools/_IO_save_load.py +151 -0
- ml_tools/IO_tools/_IO_utils.py +140 -0
- ml_tools/{IO_tools.py → IO_tools/__init__.py} +13 -5
- ml_tools/IO_tools/_imprimir.py +14 -0
- ml_tools/MICE/_MICE_imputation.py +132 -0
- ml_tools/{MICE_imputation.py → MICE/__init__.py} +6 -7
- ml_tools/{_core/_MICE_imputation.py → MICE/_dragon_mice.py} +243 -322
- ml_tools/MICE/_imprimir.py +11 -0
- ml_tools/{ML_callbacks.py → ML_callbacks/__init__.py} +12 -4
- ml_tools/ML_callbacks/_base.py +101 -0
- ml_tools/ML_callbacks/_checkpoint.py +232 -0
- ml_tools/ML_callbacks/_early_stop.py +208 -0
- ml_tools/ML_callbacks/_imprimir.py +12 -0
- ml_tools/ML_callbacks/_scheduler.py +197 -0
- ml_tools/{ML_chaining_utilities.py → ML_chain/__init__.py} +8 -3
- ml_tools/{_core/_ML_chaining_utilities.py → ML_chain/_chaining_tools.py} +5 -129
- ml_tools/ML_chain/_dragon_chain.py +140 -0
- ml_tools/ML_chain/_imprimir.py +11 -0
- ml_tools/ML_configuration/__init__.py +90 -0
- ml_tools/ML_configuration/_base_model_config.py +69 -0
- ml_tools/ML_configuration/_finalize.py +366 -0
- ml_tools/ML_configuration/_imprimir.py +47 -0
- ml_tools/ML_configuration/_metrics.py +593 -0
- ml_tools/ML_configuration/_models.py +206 -0
- ml_tools/ML_configuration/_training.py +124 -0
- ml_tools/ML_datasetmaster/__init__.py +28 -0
- ml_tools/ML_datasetmaster/_base_datasetmaster.py +337 -0
- ml_tools/{_core/_ML_datasetmaster.py → ML_datasetmaster/_datasetmaster.py} +9 -329
- ml_tools/ML_datasetmaster/_imprimir.py +15 -0
- ml_tools/{_core/_ML_sequence_datasetmaster.py → ML_datasetmaster/_sequence_datasetmaster.py} +13 -15
- ml_tools/{_core/_ML_vision_datasetmaster.py → ML_datasetmaster/_vision_datasetmaster.py} +63 -65
- ml_tools/ML_evaluation/__init__.py +53 -0
- ml_tools/ML_evaluation/_classification.py +629 -0
- ml_tools/ML_evaluation/_feature_importance.py +409 -0
- ml_tools/ML_evaluation/_imprimir.py +25 -0
- ml_tools/ML_evaluation/_loss.py +92 -0
- ml_tools/ML_evaluation/_regression.py +273 -0
- ml_tools/{_core/_ML_sequence_evaluation.py → ML_evaluation/_sequence.py} +8 -11
- ml_tools/{_core/_ML_vision_evaluation.py → ML_evaluation/_vision.py} +12 -17
- ml_tools/{_core → ML_evaluation_captum}/_ML_evaluation_captum.py +11 -38
- ml_tools/{ML_evaluation_captum.py → ML_evaluation_captum/__init__.py} +6 -4
- ml_tools/ML_evaluation_captum/_imprimir.py +10 -0
- ml_tools/{_core → ML_finalize_handler}/_ML_finalize_handler.py +3 -7
- ml_tools/ML_finalize_handler/__init__.py +10 -0
- ml_tools/ML_finalize_handler/_imprimir.py +8 -0
- ml_tools/ML_inference/__init__.py +22 -0
- ml_tools/ML_inference/_base_inference.py +166 -0
- ml_tools/{_core/_ML_chaining_inference.py → ML_inference/_chain_inference.py} +14 -17
- ml_tools/ML_inference/_dragon_inference.py +332 -0
- ml_tools/ML_inference/_imprimir.py +11 -0
- ml_tools/ML_inference/_multi_inference.py +180 -0
- ml_tools/ML_inference_sequence/__init__.py +10 -0
- ml_tools/ML_inference_sequence/_imprimir.py +8 -0
- ml_tools/{_core/_ML_sequence_inference.py → ML_inference_sequence/_sequence_inference.py} +11 -15
- ml_tools/ML_inference_vision/__init__.py +10 -0
- ml_tools/ML_inference_vision/_imprimir.py +8 -0
- ml_tools/{_core/_ML_vision_inference.py → ML_inference_vision/_vision_inference.py} +15 -19
- ml_tools/ML_models/__init__.py +32 -0
- ml_tools/{_core/_ML_models_advanced.py → ML_models/_advanced_models.py} +22 -18
- ml_tools/ML_models/_base_mlp_attention.py +198 -0
- ml_tools/{_core/_models_advanced_base.py → ML_models/_base_save_load.py} +73 -49
- ml_tools/ML_models/_dragon_tabular.py +248 -0
- ml_tools/ML_models/_imprimir.py +18 -0
- ml_tools/ML_models/_mlp_attention.py +134 -0
- ml_tools/{_core → ML_models}/_models_advanced_helpers.py +13 -13
- ml_tools/ML_models_sequence/__init__.py +10 -0
- ml_tools/ML_models_sequence/_imprimir.py +8 -0
- ml_tools/{_core/_ML_sequence_models.py → ML_models_sequence/_sequence_models.py} +5 -8
- ml_tools/ML_models_vision/__init__.py +29 -0
- ml_tools/ML_models_vision/_base_wrapper.py +254 -0
- ml_tools/ML_models_vision/_image_classification.py +182 -0
- ml_tools/ML_models_vision/_image_segmentation.py +108 -0
- ml_tools/ML_models_vision/_imprimir.py +16 -0
- ml_tools/ML_models_vision/_object_detection.py +135 -0
- ml_tools/ML_optimization/__init__.py +21 -0
- ml_tools/ML_optimization/_imprimir.py +13 -0
- ml_tools/{_core/_ML_optimization_pareto.py → ML_optimization/_multi_dragon.py} +18 -24
- ml_tools/ML_optimization/_single_dragon.py +203 -0
- ml_tools/{_core/_ML_optimization.py → ML_optimization/_single_manual.py} +75 -213
- ml_tools/{_core → ML_scaler}/_ML_scaler.py +8 -11
- ml_tools/ML_scaler/__init__.py +10 -0
- ml_tools/ML_scaler/_imprimir.py +8 -0
- ml_tools/ML_trainer/__init__.py +20 -0
- ml_tools/ML_trainer/_base_trainer.py +297 -0
- ml_tools/ML_trainer/_dragon_detection_trainer.py +402 -0
- ml_tools/ML_trainer/_dragon_sequence_trainer.py +540 -0
- ml_tools/ML_trainer/_dragon_trainer.py +1160 -0
- ml_tools/ML_trainer/_imprimir.py +10 -0
- ml_tools/{ML_utilities.py → ML_utilities/__init__.py} +14 -6
- ml_tools/ML_utilities/_artifact_finder.py +382 -0
- ml_tools/ML_utilities/_imprimir.py +16 -0
- ml_tools/ML_utilities/_inspection.py +325 -0
- ml_tools/ML_utilities/_train_tools.py +205 -0
- ml_tools/{ML_vision_transformers.py → ML_vision_transformers/__init__.py} +9 -6
- ml_tools/{_core/_ML_vision_transformers.py → ML_vision_transformers/_core_transforms.py} +11 -155
- ml_tools/ML_vision_transformers/_imprimir.py +14 -0
- ml_tools/ML_vision_transformers/_offline_augmentation.py +159 -0
- ml_tools/{_core/_PSO_optimization.py → PSO_optimization/_PSO.py} +58 -15
- ml_tools/{PSO_optimization.py → PSO_optimization/__init__.py} +5 -3
- ml_tools/PSO_optimization/_imprimir.py +10 -0
- ml_tools/SQL/__init__.py +7 -0
- ml_tools/{_core/_SQL.py → SQL/_dragon_SQL.py} +7 -11
- ml_tools/SQL/_imprimir.py +8 -0
- ml_tools/{_core → VIF}/_VIF_factor.py +5 -8
- ml_tools/{VIF_factor.py → VIF/__init__.py} +4 -2
- ml_tools/VIF/_imprimir.py +10 -0
- ml_tools/_core/__init__.py +7 -1
- ml_tools/_core/_logger.py +8 -18
- ml_tools/_core/_schema_load_ops.py +43 -0
- ml_tools/_core/_script_info.py +2 -2
- ml_tools/{data_exploration.py → data_exploration/__init__.py} +32 -16
- ml_tools/data_exploration/_analysis.py +214 -0
- ml_tools/data_exploration/_cleaning.py +566 -0
- ml_tools/data_exploration/_features.py +583 -0
- ml_tools/data_exploration/_imprimir.py +32 -0
- ml_tools/data_exploration/_plotting.py +487 -0
- ml_tools/data_exploration/_schema_ops.py +176 -0
- ml_tools/{ensemble_evaluation.py → ensemble_evaluation/__init__.py} +6 -4
- ml_tools/{_core → ensemble_evaluation}/_ensemble_evaluation.py +3 -7
- ml_tools/ensemble_evaluation/_imprimir.py +14 -0
- ml_tools/{ensemble_inference.py → ensemble_inference/__init__.py} +5 -3
- ml_tools/{_core → ensemble_inference}/_ensemble_inference.py +15 -18
- ml_tools/ensemble_inference/_imprimir.py +9 -0
- ml_tools/{ensemble_learning.py → ensemble_learning/__init__.py} +4 -6
- ml_tools/{_core → ensemble_learning}/_ensemble_learning.py +7 -10
- ml_tools/ensemble_learning/_imprimir.py +10 -0
- ml_tools/{excel_handler.py → excel_handler/__init__.py} +5 -3
- ml_tools/{_core → excel_handler}/_excel_handler.py +6 -10
- ml_tools/excel_handler/_imprimir.py +13 -0
- ml_tools/{keys.py → keys/__init__.py} +4 -1
- ml_tools/keys/_imprimir.py +11 -0
- ml_tools/{_core → keys}/_keys.py +2 -0
- ml_tools/{math_utilities.py → math_utilities/__init__.py} +5 -2
- ml_tools/math_utilities/_imprimir.py +11 -0
- ml_tools/{_core → math_utilities}/_math_utilities.py +1 -5
- ml_tools/{optimization_tools.py → optimization_tools/__init__.py} +9 -4
- ml_tools/optimization_tools/_imprimir.py +13 -0
- ml_tools/optimization_tools/_optimization_bounds.py +236 -0
- ml_tools/optimization_tools/_optimization_plots.py +218 -0
- ml_tools/{path_manager.py → path_manager/__init__.py} +6 -3
- ml_tools/{_core/_path_manager.py → path_manager/_dragonmanager.py} +11 -347
- ml_tools/path_manager/_imprimir.py +15 -0
- ml_tools/path_manager/_path_tools.py +346 -0
- ml_tools/plot_fonts/__init__.py +8 -0
- ml_tools/plot_fonts/_imprimir.py +8 -0
- ml_tools/{_core → plot_fonts}/_plot_fonts.py +2 -5
- ml_tools/schema/__init__.py +15 -0
- ml_tools/schema/_feature_schema.py +223 -0
- ml_tools/schema/_gui_schema.py +191 -0
- ml_tools/schema/_imprimir.py +10 -0
- ml_tools/{serde.py → serde/__init__.py} +4 -2
- ml_tools/serde/_imprimir.py +10 -0
- ml_tools/{_core → serde}/_serde.py +3 -8
- ml_tools/{utilities.py → utilities/__init__.py} +11 -6
- ml_tools/utilities/_imprimir.py +18 -0
- ml_tools/{_core/_utilities.py → utilities/_utility_save_load.py} +13 -190
- ml_tools/utilities/_utility_tools.py +192 -0
- dragon_ml_toolbox-19.13.0.dist-info/RECORD +0 -111
- ml_tools/ML_chaining_inference.py +0 -8
- ml_tools/ML_configuration.py +0 -86
- ml_tools/ML_configuration_pytab.py +0 -14
- ml_tools/ML_datasetmaster.py +0 -10
- ml_tools/ML_evaluation.py +0 -16
- ml_tools/ML_evaluation_multi.py +0 -12
- ml_tools/ML_finalize_handler.py +0 -8
- ml_tools/ML_inference.py +0 -12
- ml_tools/ML_models.py +0 -14
- ml_tools/ML_models_advanced.py +0 -14
- ml_tools/ML_models_pytab.py +0 -14
- ml_tools/ML_optimization.py +0 -14
- ml_tools/ML_optimization_pareto.py +0 -8
- ml_tools/ML_scaler.py +0 -8
- ml_tools/ML_sequence_datasetmaster.py +0 -8
- ml_tools/ML_sequence_evaluation.py +0 -10
- ml_tools/ML_sequence_inference.py +0 -8
- ml_tools/ML_sequence_models.py +0 -8
- ml_tools/ML_trainer.py +0 -12
- ml_tools/ML_vision_datasetmaster.py +0 -12
- ml_tools/ML_vision_evaluation.py +0 -10
- ml_tools/ML_vision_inference.py +0 -8
- ml_tools/ML_vision_models.py +0 -18
- ml_tools/SQL.py +0 -8
- ml_tools/_core/_ETL_cleaning.py +0 -694
- ml_tools/_core/_IO_tools.py +0 -498
- ml_tools/_core/_ML_callbacks.py +0 -702
- ml_tools/_core/_ML_configuration.py +0 -1332
- ml_tools/_core/_ML_configuration_pytab.py +0 -102
- ml_tools/_core/_ML_evaluation.py +0 -867
- ml_tools/_core/_ML_evaluation_multi.py +0 -544
- ml_tools/_core/_ML_inference.py +0 -646
- ml_tools/_core/_ML_models.py +0 -668
- ml_tools/_core/_ML_models_pytab.py +0 -693
- ml_tools/_core/_ML_trainer.py +0 -2323
- ml_tools/_core/_ML_utilities.py +0 -886
- ml_tools/_core/_ML_vision_models.py +0 -644
- ml_tools/_core/_data_exploration.py +0 -1901
- ml_tools/_core/_optimization_tools.py +0 -493
- ml_tools/_core/_schema.py +0 -359
- ml_tools/plot_fonts.py +0 -8
- ml_tools/schema.py +0 -12
- {dragon_ml_toolbox-19.13.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-19.13.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-19.13.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-19.13.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/top_level.txt +0 -0
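
Taken together, the renames above show the 20.0.0 restructure: each flat top-level module and its `_core` backing file become a package with an `__init__.py` plus private submodules (including a new `_imprimir.py` per package). Judging from renames such as `ETL_cleaning.py → ETL_cleaning/__init__.py`, the public import paths appear to be preserved. A hedged sketch of what that implies for callers, assuming the new `__init__.py` re-exports the names the old flat module exposed (the CSV paths are hypothetical):

```python
# Assumption: the package __init__ re-exports the old flat module's names,
# as the ETL_cleaning.py -> ETL_cleaning/__init__.py rename suggests.
from ml_tools.ETL_cleaning import basic_clean  # caller-side import unchanged

basic_clean(
    input_filepath="data/raw.csv",     # hypothetical path
    output_filepath="data/clean.csv",  # hypothetical path
)
```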
ml_tools/_core/_ETL_cleaning.py
DELETED
@@ -1,694 +0,0 @@

````python
import polars as pl
from pathlib import Path
from typing import Union, List, Dict, Optional

from ._path_manager import sanitize_filename, make_fullpath
from ._data_exploration import show_null_columns
from ._utilities import save_dataframe_filename, load_dataframe
from ._script_info import _script_info
from ._logger import get_logger


_LOGGER = get_logger("ETL Cleaning")


__all__ = [
    "DragonColumnCleaner",
    "DragonDataFrameCleaner",
    "save_unique_values",
    "basic_clean",
    "basic_clean_drop",
    "drop_macro_polars",
]


################ Unique Values per column #################
def save_unique_values(csv_path_or_df: Union[str, Path, pl.DataFrame],
                       output_dir: Union[str, Path],
                       use_columns: Optional[List[str]] = None,
                       verbose: bool = False,
                       keep_column_order: bool = True,
                       add_value_separator: bool = False) -> None:
    """
    Loads a CSV file or Polars DataFrame, analyzes it, and saves the unique non-null values
    from each column into a separate text file, exactly as they appear.

    This is useful for understanding the raw categories or range of values
    within a dataset before and after cleaning.

    Args:
        csv_path_or_df (str | Path | pl.DataFrame):
            The file path to the input CSV file or a Polars DataFrame.
        output_dir (str | Path):
            The path to the directory where the .txt files will be saved.
            The directory will be created if it does not exist.
        use_columns (List[str] | None):
            If provided, only these columns will be processed. If None, all columns will be processed.
        verbose (bool):
            If True, prints the number of unique values saved for each column.
        keep_column_order (bool):
            If True, prepends a numeric prefix to each output filename
            to maintain the original column order.
        add_value_separator (bool):
            If True, adds a separator line between each unique value.
    """
    # 1. Handle input DataFrame or path
    if isinstance(csv_path_or_df, pl.DataFrame):
        df = csv_path_or_df
        if use_columns is not None:
            # Validate columns exist
            valid_cols = [c for c in use_columns if c in df.columns]
            if not valid_cols:
                _LOGGER.error("None of the specified columns in 'use_columns' exist in the provided DataFrame.")
                raise ValueError()
            df = df.select(valid_cols)
    else:
        csv_path = make_fullpath(input_path=csv_path_or_df, enforce="file")
        df = load_dataframe(df_path=csv_path, use_columns=use_columns, kind="polars", all_strings=True)[0]

    output_dir = make_fullpath(input_path=output_dir, make=True, enforce='directory')

    if df.height == 0:
        _LOGGER.warning("The input DataFrame is empty. No unique values to save.")
        return

    # --- 2. Process Each Column ---
    counter = 0

    # Iterate over columns using Polars methods
    for i, column_name in enumerate(df.columns):
        try:
            col_expr = pl.col(column_name)

            # Check if the column is string-based (String or Utf8)
            dtype = df.schema[column_name]
            if dtype in (pl.String, pl.Utf8):
                # Filter out actual empty strings AND whitespace-only strings
                dataset = df.select(col_expr).filter(
                    col_expr.str.strip_chars().str.len_chars() > 0
                )
            else:
                dataset = df.select(col_expr)

            # Efficiently get unique non-null values and sort them
            unique_series = dataset.drop_nulls().unique().sort(column_name)

            # Convert to a python list for writing
            sorted_uniques = unique_series.to_series().to_list()

        except Exception:
            _LOGGER.error(f"Could not process column '{column_name}'.")
            continue

        if not sorted_uniques:
            _LOGGER.warning(f"Column '{column_name}' has no unique non-null values. Skipping.")
            continue

        # --- 3. Filename Generation ---
        sanitized_name = sanitize_filename(column_name)
        if not sanitized_name.strip('_'):
            sanitized_name = f'column_{i}'

        prefix = f"{i + 1}_" if keep_column_order else ''
        file_path = output_dir / f"{prefix}{sanitized_name}_unique_values.txt"

        # --- 4. Write to File ---
        try:
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(f"# Unique values for column: '{column_name}'\n")
                f.write(f"# Total unique non-null values: {len(sorted_uniques)}\n")
                f.write("-" * 30 + "\n")

                for value in sorted_uniques:
                    f.write(f"{value}\n")
                    if add_value_separator:
                        f.write("-" * 30 + "\n")

        except IOError:
            _LOGGER.exception(f"Error writing to file {file_path}.")
        else:
            if verbose:
                print(f" Successfully saved {len(sorted_uniques)} unique values from '{column_name}'.")
            counter += 1

    _LOGGER.info(f"{counter} files of unique values created.")


########## Basic df cleaners #############
def _cleaner_core(df_in: pl.DataFrame, all_lowercase: bool) -> pl.DataFrame:
    # Cleaning rules
    cleaning_rules = {
        # 1. Comprehensive Punctuation & Symbol Normalization
        # Remove invisible control characters
        r'\p{C}+': '',

        # Full-width to half-width
        # Numbers
        '０': '0', '１': '1', '２': '2', '３': '3', '４': '4',
        '５': '5', '６': '6', '７': '7', '８': '8', '９': '9',
        # Superscripts & Subscripts
        '¹': '1', '²': '2', '³': '3', '⁴': '4', '⁵': '5',
        '⁶': '6', '⁷': '7', '⁸': '8', '⁹': '9', '⁰': '0',
        '₁': '1', '₂': '2', '₃': '3', '₄': '4', '₅': '5',
        '₆': '6', '₇': '7', '₈': '8', '₉': '9', '₀': '0',
        '⁺': '', '⁻': '', '₊': '', '₋': '',
        # Uppercase Alphabet
        'Ａ': 'A', 'Ｂ': 'B', 'Ｃ': 'C', 'Ｄ': 'D', 'Ｅ': 'E', 'Ｆ': 'F',
        'Ｇ': 'G', 'Ｈ': 'H', 'Ｉ': 'I', 'Ｊ': 'J', 'Ｋ': 'K', 'Ｌ': 'L',
        'Ｍ': 'M', 'Ｎ': 'N', 'Ｏ': 'O', 'Ｐ': 'P', 'Ｑ': 'Q', 'Ｒ': 'R',
        'Ｓ': 'S', 'Ｔ': 'T', 'Ｕ': 'U', 'Ｖ': 'V', 'Ｗ': 'W', 'Ｘ': 'X',
        'Ｙ': 'Y', 'Ｚ': 'Z',
        # Lowercase Alphabet
        'ａ': 'a', 'ｂ': 'b', 'ｃ': 'c', 'ｄ': 'd', 'ｅ': 'e', 'ｆ': 'f',
        'ｇ': 'g', 'ｈ': 'h', 'ｉ': 'i', 'ｊ': 'j', 'ｋ': 'k', 'ｌ': 'l',
        'ｍ': 'm', 'ｎ': 'n', 'ｏ': 'o', 'ｐ': 'p', 'ｑ': 'q', 'ｒ': 'r',
        'ｓ': 's', 'ｔ': 't', 'ｕ': 'u', 'ｖ': 'v', 'ｗ': 'w', 'ｘ': 'x',
        'ｙ': 'y', 'ｚ': 'z',
        # Punctuation
        '》': '>', '《': '<', '：': ':', '。': '.', '；': ';', '【': '[', '】': ']', '∼': '~',
        '（': '(', '）': ')', '？': '?', '！': '!', '～': '~', '＠': '@', '＃': '#', '＋': '+', '－': '-',
        '＄': '$', '％': '%', '＾': '^', '＆': '&', '＊': '*', '＼': '-', '｜': '|', '≈': '=', '·': '', '⋅': '',
        '¯': '-', '＿': '-',

        # Commas (avoid commas in entries)
        '，': ';',
        ',': ';',
        '、': ';',

        # Others
        'σ': '',
        '□': '',
        '©': '',
        '®': '',
        '™': '',
        r'[°˚]': '',

        # Replace special characters in entries
        r'\\': '_',

        # Typographical standardization
        # Unify various dashes and hyphens to a standard hyphen
        r'[—–―]': '-',
        r'−': '-',
        # remove various quote types
        r'[“”"]': '',
        r"[‘’′']": '',

        # Collapse repeating punctuation
        r'\.{2,}': '.',  # Replace two or more dots with a single dot
        r'\?{2,}': '?',  # Replace two or more question marks with a single question mark
        r'!{2,}': '!',   # Replace two or more exclamation marks with a single one
        r';{2,}': ';',
        r'-{2,}': '-',
        r'/{2,}': '/',
        r'%{2,}': '%',
        r'&{2,}': '&',

        # 2. Internal Whitespace Consolidation
        # Collapse any sequence of whitespace chars (including non-breaking spaces) to a single space
        r'\s+': ' ',

        # 3. Leading/Trailing Whitespace Removal
        # Strip any whitespace from the beginning or end of the string
        r'^\s+|\s+$': '',

        # 4. Textual Null Standardization
        # Convert common null-like text to actual nulls.
        r'^(N/A|无|NA|NULL|NONE|NIL|-|\.|;|/|%|&)$': None,

        # 5. Final Nullification of Empty Strings
        # After all cleaning, if a string is now empty, convert it to a null
        r'^\s*$': None,
        r'^$': None,
    }

    # Clean data
    try:
        # Create a cleaner for every column in the dataframe
        all_columns = df_in.columns
        column_cleaners = [
            DragonColumnCleaner(col, rules=cleaning_rules, case_insensitive=True) for col in all_columns
        ]

        # Instantiate and run the main dataframe cleaner
        df_cleaner = DragonDataFrameCleaner(cleaners=column_cleaners)
        df_cleaned = df_cleaner.clean(df_in)

        # apply lowercase to all string columns
        if all_lowercase:
            df_final = df_cleaned.with_columns(
                pl.col(pl.String).str.to_lowercase()
            )
        else:
            df_final = df_cleaned

    except Exception as e:
        _LOGGER.error("An error occurred during the cleaning process.")
        raise e
    else:
        return df_final


def _local_path_manager(path_in: Union[str, Path], path_out: Union[str, Path]):
    # Handle paths
    input_path = make_fullpath(path_in, enforce="file")

    parent_dir = make_fullpath(Path(path_out).parent, make=True, enforce="directory")
    output_path = parent_dir / Path(path_out).name

    return input_path, output_path


def basic_clean(input_filepath: Union[str, Path], output_filepath: Union[str, Path], all_lowercase: bool = False):
    """
    Performs a comprehensive, standardized cleaning on all columns of a CSV file.

    The cleaning process includes:
    - Normalizing full-width and typographical punctuation to standard equivalents.
    - Consolidating all internal whitespace (spaces, tabs, newlines) into a single space.
    - Stripping any leading or trailing whitespace.
    - Converting common textual representations of null (e.g., "N/A", "NULL") to true null values.
    - Converting strings that become empty after cleaning into true null values.
    - Normalizing all text to lowercase (optional).

    Args:
        input_filepath (str | Path):
            The path to the source CSV file to be cleaned.
        output_filepath (str | Path):
            The path to save the cleaned CSV file.
        all_lowercase (bool):
            Whether to normalize all text to lowercase.
    """
    # Handle paths
    input_path, output_path = _local_path_manager(path_in=input_filepath, path_out=output_filepath)

    # load polars df
    df, _ = load_dataframe(df_path=input_path, kind="polars", all_strings=True)

    # CLEAN
    df_final = _cleaner_core(df_in=df, all_lowercase=all_lowercase)

    # Save cleaned dataframe
    save_dataframe_filename(df=df_final, save_dir=output_path.parent, filename=output_path.name)

    _LOGGER.info("Data successfully cleaned.")


def basic_clean_drop(input_filepath: Union[str, Path],
                     output_filepath: Union[str, Path],
                     log_directory: Union[str, Path],
                     targets: list[str],
                     skip_targets: bool = False,
                     threshold: float = 0.8,
                     all_lowercase: bool = False):
    """
    Performs standardized cleaning followed by iterative removal of rows and
    columns with excessive missing data.

    This function combines the functionality of `basic_clean` and `drop_macro_polars`. It first
    applies a comprehensive normalization process to all columns in the input CSV file.
    Then it applies iterative row and column dropping to remove redundant or incomplete data.

    Args:
        input_filepath (str | Path):
            The path to the source CSV file to be cleaned.
        output_filepath (str | Path):
            The path to save the fully cleaned CSV file after cleaning
            and missing-data-based pruning.
        log_directory (str | Path):
            Path to the directory where missing data reports will be stored.
        targets (list[str]):
            A list of column names to be treated as target variables.
            This list guides the row-dropping logic.
        skip_targets (bool):
            If True, the columns listed in `targets` will be exempt from being dropped,
            even if they exceed the missing data threshold.
        threshold (float):
            The proportion of missing data required to drop a row or column.
            For example, 0.8 means a row/column will be dropped if 80% or more
            of its data is missing.
        all_lowercase (bool):
            Whether to normalize all text to lowercase.
    """
    # handle log path
    log_path = make_fullpath(log_directory, make=True, enforce="directory")

    # Handle df paths
    input_path, output_path = _local_path_manager(path_in=input_filepath, path_out=output_filepath)

    # load polars df
    df, _ = load_dataframe(df_path=input_path, kind="polars", all_strings=True)

    # CLEAN
    df_cleaned = _cleaner_core(df_in=df, all_lowercase=all_lowercase)

    # Drop macro (Polars implementation)
    df_final = drop_macro_polars(df=df_cleaned,
                                 log_directory=log_path,
                                 targets=targets,
                                 skip_targets=skip_targets,
                                 threshold=threshold)

    # Save cleaned dataframe
    save_dataframe_filename(df=df_final, save_dir=output_path.parent, filename=output_path.name)

    _LOGGER.info("Data successfully cleaned.")


########## EXTRACT and CLEAN ##########
class DragonColumnCleaner:
    """
    A configuration object that defines cleaning rules for a single Polars DataFrame column.

    This class holds a dictionary of regex-to-replacement rules, the target column name,
    and the case-sensitivity setting. It is intended to be used with the DragonDataFrameCleaner.

    Notes:
        - Define rules from most specific to more general to create a fallback system.
        - Beware of chain replacements (rules matching strings that have already been
          changed by a previous rule in the same cleaner).
    """
    def __init__(self,
                 column_name: str,
                 rules: Union[Dict[str, Union[str, None]], Dict[str, str]],
                 case_insensitive: bool = False):
        """
        Args:
            column_name (str):
                The name of the column to be cleaned.
            rules (Dict[str, str | None]):
                A dictionary of regex patterns to replacement strings.
                - Replacement can be None to indicate that matching values should be converted to null.
                - Can use backreferences (e.g., r'$1 $2') for captured groups. Note that Polars uses a '$' prefix for backreferences.
            case_insensitive (bool):
                If True, regex matching ignores case.

        ## Usage Example

        ```python
        id_rules = {
            # Matches 'ID-12345' or 'ID 12345' and reformats to 'ID:12345'
            r'ID[- ](\\d+)': r'ID:$1'
        }

        id_cleaner = DragonColumnCleaner(column_name='user_id', rules=id_rules)
        # This object would then be passed to a DragonDataFrameCleaner.
        ```
        """
        if not isinstance(column_name, str) or not column_name:
            _LOGGER.error("The 'column_name' must be a non-empty string.")
            raise TypeError()
        if not isinstance(rules, dict):
            _LOGGER.error("The 'rules' argument must be a dictionary.")
            raise TypeError()
        # validate rules
        for pattern, replacement in rules.items():
            if not isinstance(pattern, str):
                _LOGGER.error("All keys in 'rules' must be strings representing regex patterns.")
                raise TypeError()
            if replacement is not None and not isinstance(replacement, str):
                _LOGGER.error("All values in 'rules' must be strings or None (for nullification).")
                raise TypeError()

        self.column_name = column_name
        self.rules = rules
        self.case_insensitive = case_insensitive

    def preview(self,
                csv_path: Union[str, Path],
                report_dir: Union[str, Path],
                add_value_separator: bool = False,
                rule_batch_size: int = 150):
        """
        Generates a preview report of unique values in the specified column after applying the current cleaning rules.

        Args:
            csv_path (str | Path):
                The path to the CSV file containing the data to clean.
            report_dir (str | Path):
                The directory where the preview report will be saved.
            add_value_separator (bool):
                If True, adds a separator line between each unique value in the report.
            rule_batch_size (int):
                Splits the regex rules into chunks of this size. Helps prevent memory errors.
        """
        # Load DataFrame
        df, _ = load_dataframe(df_path=csv_path, use_columns=[self.column_name], kind="polars", all_strings=True)

        preview_cleaner = DragonDataFrameCleaner(cleaners=[self])
        df_preview = preview_cleaner.clean(df, rule_batch_size=rule_batch_size)

        # Save the unique values of the cleaned column as a preview report
        save_unique_values(csv_path_or_df=df_preview,
                           output_dir=report_dir,
                           use_columns=[self.column_name],
                           verbose=False,
                           keep_column_order=False,
                           add_value_separator=add_value_separator)


class DragonDataFrameCleaner:
    """
    Orchestrates cleaning multiple columns in a Polars DataFrame.
    """
    def __init__(self, cleaners: List[DragonColumnCleaner]):
        """
        Takes a list of DragonColumnCleaner objects and applies their defined
        rules to the corresponding columns of a DataFrame using high-performance
        Polars expressions with memory optimization.

        Args:
            cleaners (List[DragonColumnCleaner]):
                A list of DragonColumnCleaner configuration objects.
        """
        if not isinstance(cleaners, list):
            _LOGGER.error("The 'cleaners' argument must be a list of DragonColumnCleaner objects.")
            raise TypeError()

        seen_columns = set()
        for cleaner in cleaners:
            if not isinstance(cleaner, DragonColumnCleaner):
                _LOGGER.error(f"All items in 'cleaners' list must be DragonColumnCleaner objects, but found an object of type {type(cleaner).__name__}.")
                raise TypeError()
            if cleaner.column_name in seen_columns:
                _LOGGER.error(f"Duplicate DragonColumnCleaner found for column '{cleaner.column_name}'. Each column should only have one cleaner.")
                raise ValueError()
            seen_columns.add(cleaner.column_name)

        self.cleaners = cleaners

    def clean(self, df: Union[pl.DataFrame, pl.LazyFrame],
              rule_batch_size: int = 150) -> pl.DataFrame:
        """
        Applies cleaning rules. Supports lazy execution to handle OOM issues.

        Args:
            df (pl.DataFrame | pl.LazyFrame):
                The data to clean.
            rule_batch_size (int):
                Splits the regex rules into chunks of this size. Helps prevent memory errors.

        Returns:
            pl.DataFrame: The cleaned, collected DataFrame.
        """
        # 1. Validate Columns (only if eager, or simple schema check if lazy)
        # Note: For LazyFrames, we assume columns exist or let it fail at collection.
        if isinstance(df, pl.DataFrame):
            df_cols = set(df.columns)
            rule_cols = {c.column_name for c in self.cleaners}
            missing = rule_cols - df_cols
            if missing:
                _LOGGER.error(f"The following columns specified in cleaners are missing from the DataFrame: {missing}")
                raise ValueError()

            # lazy internally
            lf = df.lazy()
        else:
            # It should be a LazyFrame, check type
            if not isinstance(df, pl.LazyFrame):
                _LOGGER.error("The 'df' argument must be a Polars DataFrame or LazyFrame.")
                raise TypeError()
            # It is already a LazyFrame
            lf = df

        # 2. Build Expression Chain
        final_lf = lf

        for cleaner in self.cleaners:
            col_name = cleaner.column_name

            # Get all rules as a list of items
            all_rules = list(cleaner.rules.items())

            # Process in batches of 'rule_batch_size'
            for i in range(0, len(all_rules), rule_batch_size):
                rule_batch = all_rules[i : i + rule_batch_size]

                # Start expression for this batch
                col_expr = pl.col(col_name).cast(pl.String)

                for pattern, replacement in rule_batch:
                    final_pattern = f"(?i){pattern}" if cleaner.case_insensitive else pattern

                    if replacement is None:
                        col_expr = pl.when(col_expr.str.contains(final_pattern)) \
                                     .then(None) \
                                     .otherwise(col_expr)
                    else:
                        col_expr = col_expr.str.replace_all(final_pattern, replacement)

                # Apply this batch of rules to the LazyFrame
                final_lf = final_lf.with_columns(col_expr.alias(col_name))

        # 3. Collect Results
        try:
            return final_lf.collect(engine="streaming")
        except Exception as e:
            _LOGGER.error("An error occurred during the cleaning process.")
            raise e

    def load_clean_save(self,
                        input_filepath: Union[str, Path],
                        output_filepath: Union[str, Path],
                        rule_batch_size: int = 150):
        """
        This convenience method encapsulates the entire cleaning process into a
        single call. It loads a DataFrame from a specified file, applies all
        cleaning rules configured in the `DragonDataFrameCleaner` instance, and saves
        the resulting cleaned DataFrame to a new file.

        The method ensures that all data is loaded as string types to prevent
        unintended type inference issues before cleaning operations are applied.

        Args:
            input_filepath (Union[str, Path]):
                The path to the input data file.
            output_filepath (Union[str, Path]):
                The full path where the cleaned data file will be saved.
            rule_batch_size (int):
                Splits the regex rules into chunks of this size. Helps prevent memory errors.
        """
        df, _ = load_dataframe(df_path=input_filepath, kind="polars", all_strings=True)

        df_clean = self.clean(df=df, rule_batch_size=rule_batch_size)

        if isinstance(output_filepath, str):
            output_filepath = make_fullpath(input_path=output_filepath, enforce="file")

        save_dataframe_filename(df=df_clean, save_dir=output_filepath.parent, filename=output_filepath.name)

        return None


def _generate_null_report(df: pl.DataFrame, save_dir: Path, filename: str):
    """
    Internal helper to generate and save a CSV report of missing data percentages using Polars.
    """
    total_rows = df.height
    if total_rows == 0:
        return

    null_stats = df.null_count()

    # Construct a report DataFrame
    report = pl.DataFrame({
        "column": df.columns,
        "null_count": null_stats.transpose().to_series(),
    }).with_columns(
        (pl.col("null_count") / total_rows * 100).round(2).alias("missing_percent")
    ).sort("missing_percent", descending=True)

    save_dataframe_filename(df=report, save_dir=save_dir, filename=filename)


def drop_macro_polars(df: pl.DataFrame,
                      log_directory: Path,
                      targets: list[str],
                      skip_targets: bool,
                      threshold: float) -> pl.DataFrame:
    """
    High-performance implementation of iterative row/column pruning using Polars.
    Includes temporary Pandas conversion for visualization.
    """
    df_clean = df.clone()

    # --- Helper to generate plot safely ---
    def _plot_safe(df_pl: pl.DataFrame, filename: str):
        try:
            # converting to pandas just for the plot;
            # use_pyarrow_extension_array=True is faster
            df_pd = df_pl.to_pandas(use_pyarrow_extension_array=True)
            show_null_columns(df_pd, plot_to_dir=log_directory, plot_filename=filename, use_all_columns=True)
        except Exception as e:
            _LOGGER.warning(f"Skipping plot generation due to error: {e}")

    # 1. Log Initial State
    _generate_null_report(df_clean, log_directory, "Missing_Data_Original")
    _plot_safe(df_clean, "Original")

    master = True
    while master:
        initial_rows, initial_cols = df_clean.shape

        # --- A. Drop Constant Columns ---
        # Keep columns where n_unique > 1.
        # Note: n_unique in Polars ignores nulls by default (similar to pandas dropna=True).
        # We assume that if a column is all nulls, it should also be dropped (n_unique=0).
        cols_to_keep = [
            col for col in df_clean.columns
            if df_clean[col].n_unique() > 1
        ]
        df_clean = df_clean.select(cols_to_keep)

        # --- B. Drop Rows (Targets) ---
        # Drop rows where ALL target columns are null
        valid_targets = [t for t in targets if t in df_clean.columns]
        if valid_targets:
            df_clean = df_clean.filter(
                ~pl.all_horizontal(pl.col(valid_targets).is_null())
            )

        # --- C. Drop Rows (Features Threshold) ---
        # Drop rows where the missing data fraction in FEATURE columns > threshold
        feature_cols = [c for c in df_clean.columns if c not in valid_targets]
        if feature_cols:
            # We want to KEEP rows where (null_count / total_features) <= threshold
            df_clean = df_clean.filter(
                (pl.sum_horizontal(pl.col(feature_cols).is_null()) / len(feature_cols)) <= threshold
            )

        # --- D. Drop Columns (Threshold) ---
        # Drop columns where the missing data fraction > threshold
        current_height = df_clean.height
        if current_height > 0:
            null_counts = df_clean.null_count().row(0)  # tuple of counts
            cols_to_drop = []

            for col_idx, col_name in enumerate(df_clean.columns):
                # Skip this column if it is a target and skip_targets=True
                if skip_targets and col_name in valid_targets:
                    continue

                missing_frac = null_counts[col_idx] / current_height
                if missing_frac > threshold:
                    cols_to_drop.append(col_name)

            if cols_to_drop:
                df_clean = df_clean.drop(cols_to_drop)

        # --- E. Check Convergence ---
        remaining_rows, remaining_cols = df_clean.shape
        if remaining_rows >= initial_rows and remaining_cols >= initial_cols:
            master = False

    # 2. Log Final State
    _generate_null_report(df_clean, log_directory, "Missing_Data_Processed")
    _plot_safe(df_clean, "Processed")

    return df_clean


def info():
    _script_info(__all__)
````