dragon-ml-toolbox 20.5.0__py3-none-any.whl → 20.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dragon_ml_toolbox-20.5.0.dist-info → dragon_ml_toolbox-20.7.0.dist-info}/METADATA +1 -1
- {dragon_ml_toolbox-20.5.0.dist-info → dragon_ml_toolbox-20.7.0.dist-info}/RECORD +14 -13
- ml_tools/ETL_cleaning/__init__.py +3 -1
- ml_tools/ETL_cleaning/_clean_tools.py +109 -0
- ml_tools/ETL_cleaning/_dragon_cleaner.py +72 -19
- ml_tools/ML_configuration/_metrics.py +16 -8
- ml_tools/ML_evaluation/_classification.py +76 -30
- ml_tools/keys/_keys.py +1 -0
- ml_tools/utilities/__init__.py +10 -0
- ml_tools/utilities/_translate.py +292 -0
- {dragon_ml_toolbox-20.5.0.dist-info → dragon_ml_toolbox-20.7.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-20.5.0.dist-info → dragon_ml_toolbox-20.7.0.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-20.5.0.dist-info → dragon_ml_toolbox-20.7.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-20.5.0.dist-info → dragon_ml_toolbox-20.7.0.dist-info}/top_level.txt +0 -0
{dragon_ml_toolbox-20.5.0.dist-info → dragon_ml_toolbox-20.7.0.dist-info}/RECORD CHANGED

@@ -1,11 +1,11 @@
-dragon_ml_toolbox-20.
-dragon_ml_toolbox-20.
+dragon_ml_toolbox-20.7.0.dist-info/licenses/LICENSE,sha256=L35WDmmLZNTlJvxF6Vy7Uy4SYNi6rCfWUqlTHpoRMoU,1081
+dragon_ml_toolbox-20.7.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=0-HBRMMgKuwtGy6nMJZvIn1fLxhx_ksyyVB2U_iyYZU,2818
 ml_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ml_tools/constants.py,sha256=3br5Rk9cL2IUo638eJuMOGdbGQaWssaUecYEvSeRBLM,3322
-ml_tools/ETL_cleaning/__init__.py,sha256=
+ml_tools/ETL_cleaning/__init__.py,sha256=gLRHF-qzwpqKTvbbn9chIQELeUDh_XGpBRX28j-5IqI,545
 ml_tools/ETL_cleaning/_basic_clean.py,sha256=2_FhWP-xYgl8s51H3OjYb_sqsW2yX_QZ4kmyrKjbSsc,13892
-ml_tools/ETL_cleaning/_clean_tools.py,sha256=
-ml_tools/ETL_cleaning/_dragon_cleaner.py,sha256=
+ml_tools/ETL_cleaning/_clean_tools.py,sha256=7aIC4w0CLK93E2nWC8h8YbI8bW_3Na9myD9VBMA-9zQ,9575
+ml_tools/ETL_cleaning/_dragon_cleaner.py,sha256=WvDHtdQTQldYwRWkmr3MlqFgWPl8rrEHp6m1uqgH0ho,13291
 ml_tools/ETL_engineering/__init__.py,sha256=EVIU0skxaH4ZDk8tEkOrxhTMSSA2LI_glhIpzFSxxlg,1007
 ml_tools/ETL_engineering/_dragon_engineering.py,sha256=D-D6tmhyQ3I9-cXgxLVVbQBRTZoNsWaKPsvcTUaetws,10810
 ml_tools/ETL_engineering/_transforms.py,sha256=qOxa_vjh3gzS4IiGFqq_0Wnh0ilQO41jRiIp-6Ej4vw,47079
@@ -30,7 +30,7 @@ ml_tools/ML_chain/_update_schema.py,sha256=z1Us7lv6hy6GwSu1mcid50Jmqq3sh91hMQ0Ln
 ml_tools/ML_configuration/__init__.py,sha256=ogktFnYxz5jWJkhHS4DVaMldHkt3lT2gw9jx5PQ3d78,2755
 ml_tools/ML_configuration/_base_model_config.py,sha256=95L3IfobNFMtnNr79zYpDGerC1q1v7M05tWZvTS2cwE,2247
 ml_tools/ML_configuration/_finalize.py,sha256=l_n13bLu0avMdJ8hNRrH8V_wOBQZM1UGsTydKBkTysM,15047
-ml_tools/ML_configuration/_metrics.py,sha256=
+ml_tools/ML_configuration/_metrics.py,sha256=xKtEKzphtidwwU8UuUpGv4B8Y6Bv0tAOjEFUYfz8Ehc,23758
 ml_tools/ML_configuration/_models.py,sha256=lvuuqvD6DWUzOa3i06NZfrdfOi9bu2e26T_QO6BGMSw,7629
 ml_tools/ML_configuration/_training.py,sha256=_M_TwouHFNbGrZQtQNAvyG_poSVpmN99cbyUonZsHhk,8969
 ml_tools/ML_datasetmaster/__init__.py,sha256=UltQzuXnlXVCkD-aeA5TW4IcMVLnQf1_aglawg4WyrI,580
@@ -39,7 +39,7 @@ ml_tools/ML_datasetmaster/_datasetmaster.py,sha256=Oy2UE3YJpKTaFwQF5TkQLgLB54-BF
 ml_tools/ML_datasetmaster/_sequence_datasetmaster.py,sha256=cW3fuILZWs-7Yuo4T2fgGfTC4vwho3Gp4ohIKJYS7O0,18452
 ml_tools/ML_datasetmaster/_vision_datasetmaster.py,sha256=kvSqXYeNBN1JSRfSEEXYeIcsqy9HsJAl_EwFWClqlsw,67025
 ml_tools/ML_evaluation/__init__.py,sha256=e3c8JNP0tt4Kxc7QSQpGcOgrxf8JAucH4UkJvJxUL2E,1122
-ml_tools/ML_evaluation/_classification.py,sha256=
+ml_tools/ML_evaluation/_classification.py,sha256=8bKQejKrgMipnxU1T12ted7p60xvJS0d0MvHtdNBCBM,30971
 ml_tools/ML_evaluation/_feature_importance.py,sha256=mTwi3LKom_axu6UFKunELj30APDdhG9GQC2w7I9mYhI,17137
 ml_tools/ML_evaluation/_loss.py,sha256=1a4O25i3Ya_3naNZNL7ELLUL46BY86g1scA7d7q2UFM,3625
 ml_tools/ML_evaluation/_regression.py,sha256=hnT2B2_6AnQ7aA7uk-X2lZL9G5JFGCduDXyZbr1gFCA,11037
@@ -118,7 +118,7 @@ ml_tools/ensemble_learning/_ensemble_learning.py,sha256=MHDZBR20_nStlSSeThFI3bSu
 ml_tools/excel_handler/__init__.py,sha256=AaWM3n_dqBhJLTs3OEA57ex5YykKXNOwVCyHlVsdnqI,530
 ml_tools/excel_handler/_excel_handler.py,sha256=TODudmeQgDSdxUKzLfAzizs--VL-g8WxDOfQ4sgxxLs,13965
 ml_tools/keys/__init__.py,sha256=-0c2pmrhyfROc-oQpEjJGLBMhSagA3CyFijQaaqZRqU,399
-ml_tools/keys/_keys.py,sha256=
+ml_tools/keys/_keys.py,sha256=lL9NlijxOEAhfDPPqK_wL3QhjalrYK_fWM-KNniSIOA,9308
 ml_tools/math_utilities/__init__.py,sha256=K7Obkkc4rPKj4EbRZf1BsXHfiCg7FXYv_aN9Yc2Z_Vg,400
 ml_tools/math_utilities/_math_utilities.py,sha256=BYHIVcM9tuKIhVrkgLLiM5QalJ39zx7dXYy_M9aGgiM,9012
 ml_tools/optimization_tools/__init__.py,sha256=KD8JXpfGuPndO4AHnjJGu6uV1GRwhOfboD0KZV45kzw,658
@@ -134,10 +134,11 @@ ml_tools/schema/_feature_schema.py,sha256=MuPf6Nf7tDhUTGyX7tcFHZh-lLSNsJkLmlf9Ix
 ml_tools/schema/_gui_schema.py,sha256=IVwN4THAdFrvh2TpV4SFd_zlzMX3eioF-w-qcSVTndE,7245
 ml_tools/serde/__init__.py,sha256=IDirr8i-qjUHB71hmHO6lGiODhUoOnUcXYrvb_XgrzE,292
 ml_tools/serde/_serde.py,sha256=8QnYK8ZG21zdNaC0v63iSz2bhgwOKRKAWxTVQvMV0A8,5525
-ml_tools/utilities/__init__.py,sha256=
+ml_tools/utilities/__init__.py,sha256=h4lE3SQstg-opcQj6QSKhu-HkqSbmHExsWoM9vC5D9U,1035
+ml_tools/utilities/_translate.py,sha256=t5Z7s9X3KTHn-jpe49yRdhYkzAfYzzU4EsIJiUdRnEk,10296
 ml_tools/utilities/_utility_save_load.py,sha256=EFvFaTaHahDQWdJWZr-j7cHqRbG_Xrpc96228JhV-bs,16773
 ml_tools/utilities/_utility_tools.py,sha256=bN0J9d1S0W5wNzNntBWqDsJcEAK7-1OgQg3X2fwXns0,6918
-dragon_ml_toolbox-20.
-dragon_ml_toolbox-20.
-dragon_ml_toolbox-20.
-dragon_ml_toolbox-20.
+dragon_ml_toolbox-20.7.0.dist-info/METADATA,sha256=MfguicRfdmedIMRUMM6qVIelIr56Mrqdjv4dvTPhB6Y,7866
+dragon_ml_toolbox-20.7.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+dragon_ml_toolbox-20.7.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
+dragon_ml_toolbox-20.7.0.dist-info/RECORD,,
ml_tools/ETL_cleaning/__init__.py CHANGED

@@ -10,7 +10,8 @@ from ._dragon_cleaner import (
 )
 
 from ._clean_tools import (
-    save_unique_values
+    save_unique_values,
+    save_category_counts,
 )
 
 from .._core import _imprimir_disponibles
@@ -20,6 +21,7 @@ __all__ = [
     "DragonColumnCleaner",
     "DragonDataFrameCleaner",
     "save_unique_values",
+    "save_category_counts",
     "basic_clean",
     "basic_clean_drop",
     "drop_macro_polars",
ml_tools/ETL_cleaning/_clean_tools.py CHANGED

@@ -13,6 +13,7 @@ _LOGGER = get_logger("ETL Clean Tools")
 
 __all__ = [
     "save_unique_values",
+    "save_category_counts",
 ]
 
 
@@ -126,3 +127,111 @@ def save_unique_values(csv_path_or_df: Union[str, Path, pl.DataFrame],
             counter += 1
 
     _LOGGER.info(f"{counter} files of unique values created.")
+
+
+################ Category Counts per column #################
+def save_category_counts(csv_path_or_df: Union[str, Path, pl.DataFrame],
+                         output_dir: Union[str, Path],
+                         use_columns: Optional[list[str]] = None,
+                         verbose: bool = False,
+                         keep_column_order: bool = True) -> None:
+    """
+    Calculates the frequency and percentage of each unique value in the specified columns
+    and saves the distribution report to a text file.
+
+    Useful for checking class balance or identifying rare categories.
+
+    Args:
+        csv_path_or_df (str | Path | pl.DataFrame):
+            The file path to the input CSV file or a Polars DataFrame.
+        output_dir (str | Path):
+            The directory where the report files will be saved.
+        use_columns (List[str] | None):
+            Columns to analyze. If None, all columns are processed.
+        verbose (bool):
+            If True, prints progress info.
+        keep_column_order (bool):
+            If True, prepends a numeric prefix to filenames to maintain order.
+    """
+    # 1. Handle Input
+    if isinstance(csv_path_or_df, pl.DataFrame):
+        df = csv_path_or_df
+        if use_columns:
+            valid_cols = [c for c in use_columns if c in df.columns]
+            if not valid_cols:
+                _LOGGER.error("None of the specified columns in 'use_columns' exist in the provided DataFrame.")
+                raise ValueError()
+            df = df.select(valid_cols)
+    else:
+        csv_path = make_fullpath(input_path=csv_path_or_df, enforce="file")
+        df = load_dataframe(df_path=csv_path, use_columns=use_columns, kind="polars", all_strings=True)[0]
+
+    output_path = make_fullpath(input_path=output_dir, make=True, enforce='directory')
+    total_rows = df.height
+
+    if total_rows == 0:
+        _LOGGER.warning("Input DataFrame is empty. No counts to save.")
+        return
+
+    counter = 0
+
+    # 2. Process Each Column
+    for i, col_name in enumerate(df.columns):
+        try:
+            # Group by, count, and calculate percentage
+            # We treat nulls as a category here to see missing data frequency
+            stats = (
+                df.select(pl.col(col_name))
+                .group_by(col_name, maintain_order=False)
+                .len(name="count")
+                .with_columns(
+                    (pl.col("count") / total_rows * 100).alias("pct")
+                )
+                .sort("count", descending=True)
+            )
+
+            # Collect to python list of dicts for writing
+            rows = stats.iter_rows(named=True)
+            unique_count = stats.height
+
+            # Check thresholds for warning
+            is_high_cardinality = (unique_count > 300) or ((unique_count / total_rows) > 0.5)
+
+        except Exception:
+            _LOGGER.error(f"Could not calculate counts for column '{col_name}'.")
+            continue
+
+        # 3. Write to File
+        sanitized_name = sanitize_filename(col_name)
+        if not sanitized_name.strip('_'):
+            sanitized_name = f'column_{i}'
+
+        prefix = f"{i + 1}_" if keep_column_order else ''
+        file_path = output_path / f"{prefix}{sanitized_name}_counts.txt"
+
+        try:
+            with open(file_path, 'w', encoding='utf-8') as f:
+                f.write(f"# Distribution for column: '{col_name}'\n")
+                f.write(f"# Total Rows: {total_rows} | Unique Values: {unique_count}\n")
+
+                if is_high_cardinality:
+                    f.write(f"# WARNING: High cardinality detected (Unique/Total ratio: {unique_count/total_rows:.2%}).\n")
+
+                f.write("-" * 65 + "\n")
+                f.write(f"{'Count':<10} | {'Percentage':<12} | {'Value'}\n")
+                f.write("-" * 65 + "\n")
+
+                for row in rows:
+                    val = str(row[col_name])
+                    count = row["count"]
+                    pct = row["pct"]
+                    f.write(f"{count:<10} | {pct:>10.2f}% | {val}\n")
+
+        except IOError:
+            _LOGGER.exception(f"Error writing to file {file_path}.")
+        else:
+            if verbose:
+                print(f"  Saved distribution for '{col_name}'.")
+            counter += 1
+
+    _LOGGER.info(f"{counter} distribution files created.")
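A minimal usage sketch of the `save_category_counts` helper added above, assuming the 20.7.0 package layout; the CSV path, output directory, and column names are hypothetical placeholders.

```python
from ml_tools.ETL_cleaning import save_category_counts

# Writes one "<n>_<column>_counts.txt" report per column, sorted by frequency,
# flagging high-cardinality columns (>300 uniques or a >50% unique/total ratio).
save_category_counts(
    csv_path_or_df="data/raw_survey.csv",   # placeholder CSV path
    output_dir="reports/category_counts",   # placeholder output directory
    use_columns=["country", "material"],    # None processes every column
    verbose=True,
    keep_column_order=True,                 # prefixes filenames with 1_, 2_, ...
)
```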
ml_tools/ETL_cleaning/_dragon_cleaner.py CHANGED

@@ -1,13 +1,13 @@
 import polars as pl
 from pathlib import Path
-from typing import Union
+from typing import Union, Optional
 
 from ..utilities import save_dataframe_filename, load_dataframe
 
 from .._core import get_logger
 from ..path_manager import make_fullpath
 
-from ._clean_tools import save_unique_values
+from ._clean_tools import save_unique_values, save_category_counts
 
 
 _LOGGER = get_logger("DragonCleaner")
@@ -33,12 +33,18 @@ class DragonColumnCleaner:
     """
     def __init__(self,
                  column_name: str,
-
+                 exact_matches: Optional[Union[dict[str, Union[str, None]], dict[str, str]]] = None,
+                 rules: Optional[Union[dict[str, Union[str, None]], dict[str, str]]] = None,
                  case_insensitive: bool = False):
         """
         Args:
             column_name (str):
                 The name of the column to be cleaned.
+            exact_matches (Dict[str, str | None]):
+                A dictionary of EXACT string matches to replacement strings.
+                - Uses a hash map, which is significantly faster than regex.
+                - Used for simple 1-to-1 mappings (e.g., {'Aluminum': 'Al'}).
+                - Runs BEFORE the regex rules.
             rules (Dict[str, str | None]):
                 A dictionary of regex patterns to replacement strings.
                 - Replacement can be None to indicate that matching values should be converted to null.
@@ -61,25 +67,47 @@ class DragonColumnCleaner:
         if not isinstance(column_name, str) or not column_name:
             _LOGGER.error("The 'column_name' must be a non-empty string.")
             raise TypeError()
-
-
-
-
-
-            if not isinstance(pattern, str):
-                _LOGGER.error("All keys in 'rules' must be strings representing regex patterns.")
+
+        # Validate Regex Rules
+        if rules is not None:
+            if not isinstance(rules, dict):
+                _LOGGER.error("The 'rules' argument must be a dictionary.")
                 raise TypeError()
-
-
+            for pattern, replacement in rules.items():
+                if not isinstance(pattern, str):
+                    _LOGGER.error("All keys in 'rules' must be strings representing regex patterns.")
+                    raise TypeError()
+                if replacement is not None and not isinstance(replacement, str):
+                    _LOGGER.error("All values in 'rules' must be strings or None (for nullification).")
+                    raise TypeError()
+
+        # Validate Exact Matches
+        if exact_matches is not None:
+            if not isinstance(exact_matches, dict):
+                _LOGGER.error("The 'exact_matches' argument must be a dictionary.")
                 raise TypeError()
+            for key, val in exact_matches.items():
+                if not isinstance(key, str):
+                    _LOGGER.error("All keys in 'exact_matches' must be strings.")
+                    raise TypeError()
+                if val is not None and not isinstance(val, str):
+                    _LOGGER.error("All values in 'exact_matches' must be strings or None.")
+                    raise TypeError()
+
+        # Raise if both are None or empty
+        if not rules and not exact_matches:
+            _LOGGER.error("At least one of 'rules' or 'exact_matches' must be provided.")
+            raise ValueError()
 
         self.column_name = column_name
-        self.rules = rules
+        self.rules = rules if rules else {}
+        self.exact_matches = exact_matches if exact_matches else {}
         self.case_insensitive = case_insensitive
 
     def preview(self,
                 csv_path: Union[str, Path],
                 report_dir: Union[str, Path],
+                show_distribution: bool = True,
                 add_value_separator: bool=False,
                 rule_batch_size: int = 150):
         """
@@ -90,6 +118,8 @@ class DragonColumnCleaner:
                 The path to the CSV file containing the data to clean.
             report_dir (str | Path):
                 The directory where the preview report will be saved.
+            show_distribution (bool):
+                If True, generates a category count report for the column after cleaning.
             add_value_separator (bool):
                 If True, adds a separator line between each unique value in the report.
             rule_batch_size (int):
@@ -101,13 +131,21 @@ class DragonColumnCleaner:
         preview_cleaner = DragonDataFrameCleaner(cleaners=[self])
         df_preview = preview_cleaner.clean(df, rule_batch_size=rule_batch_size)
 
-        # Apply cleaning rules
+        # Apply cleaning rules and save reports
         save_unique_values(csv_path_or_df=df_preview,
                            output_dir=report_dir,
                            use_columns=[self.column_name],
                            verbose=False,
                            keep_column_order=False,
                            add_value_separator=add_value_separator)
+
+        # Optionally save category counts
+        if show_distribution:
+            save_category_counts(csv_path_or_df=df_preview,
+                                 output_dir=report_dir,
+                                 use_columns=[self.column_name],
+                                 verbose=False,
+                                 keep_column_order=False)
 
 
 class DragonDataFrameCleaner:
@@ -181,16 +219,23 @@ class DragonDataFrameCleaner:
         for cleaner in self.cleaners:
             col_name = cleaner.column_name
 
-            #
+            # Start expression for this batch
+            col_expr = pl.col(col_name).cast(pl.String)
+
+            # --- PHASE 1: EXACT MATCHES ---
+            # Apply dictionary-based replacement first (faster than regex)
+            if cleaner.exact_matches:
+                # 'replace' handles dictionary mapping safely. If value is mapped to None, it becomes null.
+                col_expr = col_expr.replace(cleaner.exact_matches)
+
+            # --- PHASE 2: REGEX PATTERNS ---
             all_rules = list(cleaner.rules.items())
 
             # Process in batches of 'rule_batch_size'
             for i in range(0, len(all_rules), rule_batch_size):
                 rule_batch = all_rules[i : i + rule_batch_size]
 
-                #
-                col_expr = pl.col(col_name).cast(pl.String)
-
+                # continue chaining operations on the same col_expr
                 for pattern, replacement in rule_batch:
                     final_pattern = f"(?i){pattern}" if cleaner.case_insensitive else pattern
 
@@ -202,6 +247,15 @@ class DragonDataFrameCleaner:
                         col_expr = col_expr.str.replace_all(final_pattern, replacement)
 
                 # Apply this batch of rules to the LazyFrame
+                # apply partially here to keep the logical plan size under control
+                final_lf = final_lf.with_columns(col_expr.alias(col_name))
+
+                # Reset col_expr for the next batch, but pointing to the 'new' column
+                # This ensures the next batch works on the result of the previous batch
+                col_expr = pl.col(col_name)
+
+            # If we had exact matches but NO regex rules, we still need to apply the expression once
+            if cleaner.exact_matches and not all_rules:
                 final_lf = final_lf.with_columns(col_expr.alias(col_name))
 
             # 3. Collect Results
@@ -242,4 +296,3 @@ class DragonDataFrameCleaner:
         save_dataframe_filename(df=df_clean, save_dir=output_filepath.parent, filename=output_filepath.name)
 
         return None
-
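A sketch of how the new `exact_matches` phase might be combined with regex `rules`; the column, mappings, and data are hypothetical placeholders. Exact matches are applied as a hash-map lookup before any regex, so cheap 1-to-1 substitutions no longer pay the regex cost.

```python
import polars as pl
from ml_tools.ETL_cleaning import DragonColumnCleaner, DragonDataFrameCleaner

metal_cleaner = DragonColumnCleaner(
    column_name="metal",
    # Phase 1: exact string replacements (mapping a value to None nullifies it)
    exact_matches={"Aluminum": "Al", "aluminium": "Al", "N/A": None},
    # Phase 2: regex patterns, applied in batches after the exact matches
    rules={r"\s*\(pure\)\s*": "", r"^unknown.*": None},
    case_insensitive=True,
)

df = pl.DataFrame({"metal": ["Aluminum", "Iron (pure)", "unknown alloy", "N/A"]})
cleaned = DragonDataFrameCleaner(cleaners=[metal_cleaner]).clean(df, rule_batch_size=150)
print(cleaned)
```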
ml_tools/ML_configuration/_metrics.py CHANGED

@@ -1,4 +1,4 @@
-from typing import Union
+from typing import Union, Literal
 
 
 __all__ = [
@@ -26,7 +26,7 @@ class _BaseClassificationFormat:
     def __init__(self,
                  cmap: str="BuGn",
                  ROC_PR_line: str='darkorange',
-                 calibration_bins: int=
+                 calibration_bins: Union[int, Literal['auto']]='auto',
                  xtick_size: int=22,
                  ytick_size: int=22,
                  legend_size: int=26,
@@ -46,8 +46,8 @@ class _BaseClassificationFormat:
             - Common color names: 'darkorange', 'cornflowerblue', 'crimson', 'forestgreen'
             - Hex codes: '#FF6347', '#4682B4'
 
-            calibration_bins (int): The number of bins to use when
-
+            calibration_bins (int | 'auto'): The number of bins to use when creating the calibration (reliability) plot. If 'auto', the number will be dynamically determined based on the number of samples.
+            - Typical int values: 10, 15, 20
 
             font_size (int): The base font size to apply to the plots.
 
@@ -97,6 +97,7 @@ class _BaseMultiLabelFormat:
     def __init__(self,
                  cmap: str = "BuGn",
                  ROC_PR_line: str='darkorange',
+                 calibration_bins: Union[int, Literal['auto']]='auto',
                  font_size: int = 25,
                  xtick_size: int=20,
                  ytick_size: int=20,
@@ -115,6 +116,9 @@ class _BaseMultiLabelFormat:
             - Common color names: 'darkorange', 'cornflowerblue', 'crimson', 'forestgreen'
             - Hex codes: '#FF6347', '#4682B4'
 
+            calibration_bins (int | 'auto'): The number of bins to use when creating the calibration (reliability) plots for each label. If 'auto', the number will be dynamically determined based on the number of samples.
+            - Typical int values: 10, 15, 20
+
             font_size (int): The base font size to apply to the plots.
 
             xtick_size (int): Font size for x-axis tick labels.
@@ -133,6 +137,7 @@ class _BaseMultiLabelFormat:
         """
         self.cmap = cmap
         self.ROC_PR_line = ROC_PR_line
+        self.calibration_bins = calibration_bins
         self.font_size = font_size
         self.xtick_size = xtick_size
         self.ytick_size = ytick_size
@@ -142,6 +147,7 @@ class _BaseMultiLabelFormat:
         parts = [
             f"cmap='{self.cmap}'",
             f"ROC_PR_line='{self.ROC_PR_line}'",
+            f"calibration_bins={self.calibration_bins}",
             f"font_size={self.font_size}",
             f"xtick_size={self.xtick_size}",
             f"ytick_size={self.ytick_size}",
@@ -416,7 +422,7 @@ class FormatBinaryClassificationMetrics(_BaseClassificationFormat):
     def __init__(self,
                  cmap: str="BuGn",
                  ROC_PR_line: str='darkorange',
-                 calibration_bins: int=
+                 calibration_bins: Union[int, Literal['auto']]='auto',
                  font_size: int=26,
                  xtick_size: int=22,
                  ytick_size: int=22,
@@ -440,7 +446,7 @@ class FormatMultiClassClassificationMetrics(_BaseClassificationFormat):
     def __init__(self,
                  cmap: str="BuGn",
                  ROC_PR_line: str='darkorange',
-                 calibration_bins: int=
+                 calibration_bins: Union[int, Literal['auto']]='auto',
                  font_size: int=26,
                  xtick_size: int=22,
                  ytick_size: int=22,
@@ -464,7 +470,7 @@ class FormatBinaryImageClassificationMetrics(_BaseClassificationFormat):
     def __init__(self,
                  cmap: str="BuGn",
                  ROC_PR_line: str='darkorange',
-                 calibration_bins: int=
+                 calibration_bins: Union[int, Literal['auto']]='auto',
                  font_size: int=26,
                  xtick_size: int=22,
                  ytick_size: int=22,
@@ -488,7 +494,7 @@ class FormatMultiClassImageClassificationMetrics(_BaseClassificationFormat):
     def __init__(self,
                  cmap: str="BuGn",
                  ROC_PR_line: str='darkorange',
-                 calibration_bins: int=
+                 calibration_bins: Union[int, Literal['auto']]='auto',
                  font_size: int=26,
                  xtick_size: int=22,
                  ytick_size: int=22,
@@ -513,6 +519,7 @@ class FormatMultiLabelBinaryClassificationMetrics(_BaseMultiLabelFormat):
     def __init__(self,
                  cmap: str = "BuGn",
                  ROC_PR_line: str='darkorange',
+                 calibration_bins: Union[int, Literal['auto']]='auto',
                  font_size: int = 25,
                  xtick_size: int=20,
                  ytick_size: int=20,
@@ -520,6 +527,7 @@ class FormatMultiLabelBinaryClassificationMetrics(_BaseMultiLabelFormat):
                 ) -> None:
         super().__init__(cmap=cmap,
                          ROC_PR_line=ROC_PR_line,
+                         calibration_bins=calibration_bins,
                          font_size=font_size,
                          xtick_size=xtick_size,
                          ytick_size=ytick_size,
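Construction sketch for the new `calibration_bins` option, assuming the Format* classes are re-exported from `ml_tools.ML_configuration` (otherwise import them from `ml_tools.ML_configuration._metrics`). `'auto'`, now the default, defers bin selection to the evaluation code; a positive int keeps the previous fixed-bin behaviour.

```python
from ml_tools.ML_configuration import FormatBinaryClassificationMetrics

fmt_auto = FormatBinaryClassificationMetrics(calibration_bins="auto")  # sample-size based bins
fmt_fixed = FormatBinaryClassificationMetrics(calibration_bins=15)     # fixed number of bins
```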
ml_tools/ML_evaluation/_classification.py CHANGED

@@ -2,7 +2,7 @@ import numpy as np
 import pandas as pd
 import matplotlib.pyplot as plt
 import seaborn as sns
-from sklearn.calibration import
+from sklearn.calibration import calibration_curve
 from sklearn.metrics import (
     classification_report,
     ConfusionMatrixDisplay,
@@ -378,42 +378,42 @@ def classification_metrics(save_dir: Union[str, Path],
 
     # --- Save Calibration Plot ---
     fig_cal, ax_cal = plt.subplots(figsize=CLASSIFICATION_PLOT_SIZE, dpi=DPI_value)
+
+    user_chosen_bins = format_config.calibration_bins
+
+    # --- Automate Bin Selection ---
+    if not isinstance(user_chosen_bins, int) or user_chosen_bins <= 0:
+        # Determine bins based on number of samples
+        n_samples = y_true.shape[0]
+        if n_samples < 200:
+            dynamic_bins = 5
+        elif n_samples < 1000:
+            dynamic_bins = 10
+        else:
+            dynamic_bins = 15
+    else:
+        dynamic_bins = user_chosen_bins
+
+    # --- Step 1: Get binned data directly ---
+    # calculates reliability diagram data without needing a temporary plot
+    prob_true, prob_pred = calibration_curve(y_true_binary, y_score, n_bins=dynamic_bins)
 
-    # --- Step
-    with plt.ioff(): # Suppress showing the temporary plot
-        fig_temp, ax_temp = plt.subplots()
-        cal_display_temp = CalibrationDisplay.from_predictions(
-            y_true_binary, # Use binarized labels
-            y_score,
-            n_bins=format_config.calibration_bins,
-            ax=ax_temp,
-            name="temp" # Add a name to suppress potential warnings
-        )
-        # Get the x, y coordinates of the binned data
-        line_x, line_y = cal_display_temp.line_.get_data() # type: ignore
-        plt.close(fig_temp) # Close the temporary plot
-
-    # --- Step 2: Build the plot from scratch ---
+    # --- Step 2: Plot ---
     ax_cal.plot([0, 1], [0, 1], 'k--', label='Perfectly calibrated')
 
-
-
-
-
-
-
-
-            'color': format_config.ROC_PR_line,
-            'linestyle': '--',
-            'linewidth': 2,
-        }
-    )
+    # Plot the actual calibration curve (connect points with a line)
+    ax_cal.plot(prob_pred,
+                prob_true,
+                marker='o', # Add markers to see bin locations
+                linewidth=2,
+                label="Model calibration",
+                color=format_config.ROC_PR_line)
 
     ax_cal.set_title(f'Reliability Curve{plot_title}', pad=_EvaluationConfig.LABEL_PADDING, fontsize=format_config.font_size + 2)
     ax_cal.set_xlabel('Mean Predicted Probability', labelpad=_EvaluationConfig.LABEL_PADDING, fontsize=format_config.font_size)
     ax_cal.set_ylabel('Fraction of Positives', labelpad=_EvaluationConfig.LABEL_PADDING, fontsize=format_config.font_size)
 
-    # --- Step 3: Set final limits
+    # --- Step 3: Set final limits ---
     ax_cal.set_ylim(0.0, 1.0)
     ax_cal.set_xlim(0.0, 1.0)
 
@@ -428,7 +428,7 @@ def classification_metrics(save_dir: Union[str, Path],
     cal_path = save_dir_path / f"calibration_plot{save_suffix}.svg"
     plt.savefig(cal_path)
     plt.close(fig_cal)
-
+
     _LOGGER.info(f"📈 Saved {len(class_indices_to_plot)} sets of ROC, Precision-Recall, and Calibration plots.")
 
 
@@ -632,6 +632,52 @@ def multi_label_classification_metrics(
         pr_path = save_dir_path / f"pr_curve_{sanitized_name}.svg"
         plt.savefig(pr_path)
         plt.close(fig_pr)
+
+        # --- Save Calibration Plot (New Feature) ---
+        fig_cal, ax_cal = plt.subplots(figsize=CLASSIFICATION_PLOT_SIZE, dpi=DPI_value)
+
+        user_chosen_bins = format_config.calibration_bins
+
+        # --- Automate Bin Selection ---
+        if not isinstance(user_chosen_bins, int) or user_chosen_bins <= 0:
+            # Determine bins based on number of samples
+            n_samples = y_true.shape[0]
+            if n_samples < 200:
+                dynamic_bins = 5
+            elif n_samples < 1000:
+                dynamic_bins = 10
+            else:
+                dynamic_bins = 15
+        else:
+            dynamic_bins = user_chosen_bins
+
+        # Calculate calibration curve for this specific label
+        prob_true, prob_pred = calibration_curve(true_i, prob_i, n_bins=dynamic_bins)
+
+        ax_cal.plot([0, 1], [0, 1], 'k--', label='Perfectly calibrated')
+        ax_cal.plot(prob_pred,
+                    prob_true,
+                    marker='o',
+                    linewidth=2,
+                    label=f"Calibration for '{name}'",
+                    color=format_config.ROC_PR_line)
+
+        ax_cal.set_title(f'Reliability Curve for "{name}"', pad=_EvaluationConfig.LABEL_PADDING, fontsize=base_font_size + 2)
+        ax_cal.set_xlabel('Mean Predicted Probability', labelpad=_EvaluationConfig.LABEL_PADDING, fontsize=base_font_size)
+        ax_cal.set_ylabel('Fraction of Positives', labelpad=_EvaluationConfig.LABEL_PADDING, fontsize=base_font_size)
+
+        ax_cal.set_ylim(0.0, 1.0)
+        ax_cal.set_xlim(0.0, 1.0)
+
+        ax_cal.tick_params(axis='x', labelsize=xtick_size)
+        ax_cal.tick_params(axis='y', labelsize=ytick_size)
+        ax_cal.legend(loc='lower right', fontsize=legend_size)
+        ax_cal.grid(True)
+
+        plt.tight_layout()
+        cal_path = save_dir_path / f"calibration_plot_{sanitized_name}.svg"
+        plt.savefig(cal_path)
+        plt.close(fig_cal)
 
     _LOGGER.info(f"All individual label reports and plots saved to '{save_dir_path.name}'")
 
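A standalone sketch of the bin-selection heuristic the new calibration code applies when `calibration_bins` is `'auto'` (thresholds taken from the hunks above); `pick_calibration_bins` and the toy labels/scores are illustrative, not part of the package.

```python
import numpy as np
from sklearn.calibration import calibration_curve

def pick_calibration_bins(n_samples: int, user_bins="auto") -> int:
    # Mirrors the diff: an explicit positive int wins, otherwise scale with sample count.
    if isinstance(user_bins, int) and user_bins > 0:
        return user_bins
    if n_samples < 200:
        return 5
    if n_samples < 1000:
        return 10
    return 15

rng = np.random.default_rng(0)
y_true = rng.integers(0, 2, size=500)
y_score = np.clip(y_true * 0.6 + rng.normal(0.2, 0.2, size=500), 0.0, 1.0)

n_bins = pick_calibration_bins(len(y_true))          # 10 for 500 samples
prob_true, prob_pred = calibration_curve(y_true, y_score, n_bins=n_bins)
```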
ml_tools/keys/_keys.py CHANGED

ml_tools/utilities/__init__.py CHANGED

@@ -15,6 +15,13 @@ from ._utility_tools import (
     train_dataset_yielder
 )
 
+from ._translate import (
+    translate_dataframe_columns,
+    create_translation_template,
+    audit_column_translation
+)
+
+
 from .._core import _imprimir_disponibles
 
 
@@ -27,6 +34,9 @@ __all__ = [
     "save_dataframe",
     "save_dataframe_with_schema",
     "merge_dataframes",
+    "translate_dataframe_columns",
+    "create_translation_template",
+    "audit_column_translation",
    "distribute_dataset_by_target",
     "train_dataset_orchestrator",
     "train_dataset_yielder"
ml_tools/utilities/_translate.py ADDED

@@ -0,0 +1,292 @@
+import json
+import pandas as pd
+import polars as pl
+from pathlib import Path
+from typing import Union, Literal
+
+from ..path_manager import make_fullpath
+from .._core import get_logger
+
+from ._utility_save_load import load_dataframe
+
+
+_LOGGER = get_logger("Translation Tools")
+
+
+__all__ = [
+    "translate_dataframe_columns",
+    "create_translation_template",
+    "audit_column_translation"
+]
+
+
+def translate_dataframe_columns(
+    df: Union[pd.DataFrame, pl.DataFrame],
+    mapper: Union[dict[str, str], str, Path],
+    direction: Literal["A_to_B", "B_to_A"] = "A_to_B",
+    verbose: int = 3
+) -> Union[pd.DataFrame, pl.DataFrame]:
+    """
+    Translates the column names of a DataFrame (Pandas or Polars) using a provided mapping source.
+
+    The mapping can be a python dictionary, a JSON file, or a CSV file.
+
+    Translation Logic:
+    -----------------
+    The DataFrame currently has columns in 'Language A'.
+
+    - "A_to_B" (Standard):
+        The mapper is structured as {Language A : Language B}.
+        Keys match the current DataFrame columns.
+
+    - "B_to_A" (Inverted Source):
+        The mapper is structured as {Language B : Language A}.
+        Values match the current DataFrame columns.
+
+    Parameters
+    ----------
+    df : (pd.DataFrame | pl.DataFrame)
+        The input DataFrame to be translated.
+    mapper : (dict[str, str] | str | Path)
+        The source of the translation mapping:
+        - Dict: {'original_name': 'new_name'}
+        - JSON path: File containing a single JSON object (dict).
+        - CSV path: File with two columns.
+    direction : Literal["A_to_B", "B_to_A"]
+        Specifies the structure of the provided mapper relative to the DataFrame.
+    verbose : int
+        Whether to log warnings and information about the process.
+
+    Returns
+    -------
+    Dataframe:
+        The polars or pandas DataFrame with renamed columns.
+    """
+    # df type validation
+    if not isinstance(df, (pd.DataFrame, pl.DataFrame)):
+        _LOGGER.error(f"Input df must be a pandas or polars DataFrame. Got: {type(df)}")
+        raise TypeError()
+
+    # 1. Load and Standardize the Mapping
+    translation_map = _load_translation_mapping(mapper, direction)
+
+    # 2. Validation: Check intersection between DF columns and Map keys
+    df_cols = set(df.columns)
+    map_keys = set(translation_map.keys())
+
+    # Calculate overlap
+    common_cols = df_cols.intersection(map_keys)
+
+    if not common_cols:
+        if verbose >= 1:
+            _LOGGER.warning("No column names matched the provided translation mapping. Returning original DataFrame.")
+        return df
+
+    missing_in_map = df_cols - map_keys
+    if missing_in_map and verbose >= 1:
+        _LOGGER.warning(f"Columns not found in translation map: {list(missing_in_map)}")
+
+    if verbose >= 3:
+        _LOGGER.info(f"Translating {len(common_cols)} columns...")
+
+    # 3. Apply Translation
+    try:
+        if isinstance(df, pd.DataFrame):
+            return df.rename(columns=translation_map)
+        elif isinstance(df, pl.DataFrame):
+            return df.rename(translation_map)
+    except Exception as e:
+        _LOGGER.error(f"Failed to rename columns: {e}")
+        raise e
+
+    if verbose >= 2:
+        _LOGGER.info(f"Successfully translated {len(common_cols)} columns.")
+
+
+def create_translation_template(
+    df_or_path: Union[pd.DataFrame, pl.DataFrame, str, Path],
+    save_path: Union[str, Path],
+    verbose: bool = True
+) -> None:
+    """
+    Generates a JSON translation template from a DataFrame's column names.
+
+    Creates a 'translation_template.json' file where keys are the dataframe column names and values
+    are empty strings, ready for manual translation.
+
+    Parameters
+    ----------
+    df_or_path : [DataFrame | str | Path]
+        The DataFrame or path to a CSV file to extract column names from.
+    save_path : [str | Path]
+        The destination directory for the .json template.
+    """
+    # 1. Get Columns
+    if isinstance(df_or_path, (str, Path)):
+        df, _ = load_dataframe(df_or_path, kind="pandas", verbose=False)
+        columns = df.columns.tolist()
+    elif isinstance(df_or_path, pd.DataFrame):
+        columns = df_or_path.columns.tolist()
+    elif isinstance(df_or_path, pl.DataFrame):
+        columns = df_or_path.columns
+    else:
+        _LOGGER.error("Input must be a DataFrame or a path to a dataset.")
+        raise TypeError()
+
+    # 2. Create Dictionary {ColName : ""}
+    template_dict = {col: "" for col in columns}
+
+    # 3. Save to JSON
+    out_path = make_fullpath(save_path, enforce="directory")
+    full_out_path = out_path / "translation_template.json"
+
+    try:
+        with open(full_out_path, 'w', encoding='utf-8') as f:
+            json.dump(template_dict, f, indent=4, ensure_ascii=False)
+
+        if verbose:
+            _LOGGER.info(f"Translation template created at '{out_path.name}' with {len(columns)} entries.")
+    except Exception as e:
+        _LOGGER.error(f"Failed to save template: {e}")
+        raise e
+
+
+def audit_column_translation(
+    df_or_path: Union[pd.DataFrame, pl.DataFrame, str, Path],
+    mapper: Union[dict[str, str], str, Path],
+    direction: Literal["A_to_B", "B_to_A"] = "A_to_B"
+) -> None:
+    """
+    Audits the coverage of a translation map against a DataFrame WITHOUT applying changes.
+
+    Logs a detailed report of:
+    - How many columns will be renamed.
+    - Which DataFrame columns are NOT in the map (will remain unchanged).
+    - Which Map keys are NOT in the DataFrame (unused mappings).
+
+    Parameters
+    ----------
+    df_or_path : [DataFrame | str | Path]
+        The target dataset to audit.
+    mapper : [Dict | str | Path]
+        The translation source.
+    direction : ["A_to_B" | "B_to_A"]
+        Direction logic (see translate_dataframe_columns).
+    """
+    # 1. Get DataFrame Columns
+    if isinstance(df_or_path, (str, Path)):
+        df, df_name = load_dataframe(df_or_path, kind="pandas", verbose=False)
+        cols = set(df.columns)
+        source_name = f"File: '{df_name}'"
+    elif isinstance(df_or_path, pd.DataFrame):
+        cols = set(df_or_path.columns)
+        source_name = "DataFrame (Pandas)"
+    elif isinstance(df_or_path, pl.DataFrame):
+        cols = set(df_or_path.columns)
+        source_name = "DataFrame (Polars)"
+    else:
+        _LOGGER.error("Input must be a DataFrame or a path to a dataset.")
+        raise TypeError()
+
+    # 2. Load Map
+    try:
+        trans_map = _load_translation_mapping(mapper, direction)
+        map_keys = set(trans_map.keys())
+    except Exception as e:
+        _LOGGER.error(f"Could not load mapper. {e}")
+        return
+
+    # 3. Analyze Sets
+    matched = cols.intersection(map_keys)
+    missing_in_map = cols - map_keys
+    unused_map_keys = map_keys - cols
+
+    coverage_pct = (len(matched) / len(cols) * 100) if len(cols) > 0 else 0.0
+
+    # 4. Report
+    report_string = f"--- 🔍 Translation Audit Report: {source_name} ---\n \
+        Direction: {direction}\n \
+        Total Columns: {len(cols)}\n \
+        Map Coverage: {len(matched)} / {len(cols)} ({coverage_pct:.1f}%)\n"
+
+    if matched:
+        report_string += f"\n✅ Will Translate: {len(matched)} columns"
+
+    if missing_in_map:
+        report_string += f"\n⚠️ Not in Map: {len(missing_in_map)} columns: {list(missing_in_map)}"
+
+    if unused_map_keys:
+        report_string += f"\n➡️ Unused Map Keys: {len(unused_map_keys)}"
+
+    _LOGGER.info(report_string)
+
+
+def _load_translation_mapping(
+    source: Union[dict[str, str], str, Path],
+    direction: Literal["A_to_B", "B_to_A"]
+) -> dict[str, str]:
+    """
+    Internal helper to load mapping from Dict, JSON, or CSV and handle direction inversion.
+    """
+    raw_map: dict[str, str] = {}
+
+    # --- Load Source ---
+    if isinstance(source, dict):
+        raw_map = source.copy()
+
+    elif isinstance(source, (str, Path)):
+        path = make_fullpath(source, enforce="file")
+
+        if path.suffix.lower() == ".json":
+            with open(path, 'r', encoding='utf-8') as f:
+                content = json.load(f)
+            if not isinstance(content, dict):
+                _LOGGER.error(f"JSON file '{path.name}' does not contain a dictionary.")
+                raise ValueError()
+            raw_map = content
+
+        elif path.suffix.lower() == ".csv":
+            # Load CSV using pandas for robustness
+            try:
+                df_map = pd.read_csv(path)
+
+                # STRICT VALIDATION: Must be exactly 2 columns
+                if df_map.shape[1] != 2:
+                    _LOGGER.error(f"CSV file '{path.name}' must have exactly 2 columns for mapping. Found {df_map.shape[1]}.")
+                    raise ValueError()
+
+                key_col = df_map.columns[0]
+                val_col = df_map.columns[1]
+
+                # Convert to dictionary (drop NaNs to be safe)
+                raw_map = df_map.dropna(subset=[key_col, val_col]).set_index(key_col)[val_col].to_dict()
+
+            except Exception as e:
+                _LOGGER.error(f"Error reading CSV mapping file: {e}")
+                raise e
+        else:
+            _LOGGER.error(f"Unsupported file extension for mapping source: {path.suffix}")
+            raise ValueError()
+    else:
+        _LOGGER.error("Mapper must be a Dictionary, or a Path/String to a JSON/CSV file.")
+        raise TypeError()
+
+    # --- Handle Direction ---
+    # Case: The mapper is A->B, and DF is A. (Keys match DF). Return as is.
+    if direction == "A_to_B":
+        return raw_map
+
+    # Case: The mapper is B->A, but DF is A. (Values match DF).
+    # swap the mapper to A->B so the Keys match the DF.
+    elif direction == "B_to_A":
+        # Inversion requires unique values to be lossless
+        reversed_map = {v: k for k, v in raw_map.items()}
+
+        if len(reversed_map) < len(raw_map):
+            _LOGGER.warning("Direction 'B_to_A' resulted in fewer keys than original. Duplicate target values existed in the source map; some collisions were overwritten.")
+
+        return reversed_map
+
+    else:
+        _LOGGER.error("Direction must be 'A_to_B' or 'B_to_A'.")
+        raise ValueError()
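A usage sketch of the three new translation helpers exported from `ml_tools.utilities`; the DataFrame, mapping, and output directory are hypothetical placeholders (the directory passed to `create_translation_template` is expected to exist).

```python
import pandas as pd
from ml_tools.utilities import (
    create_translation_template,
    audit_column_translation,
    translate_dataframe_columns,
)

df = pd.DataFrame({"edad": [30, 41], "peso_kg": [70.2, 81.5]})

# 1. Dump {"edad": "", "peso_kg": ""} to translation_template.json for manual editing.
create_translation_template(df, save_path="reports")

# 2. Check coverage of a filled-in mapping without renaming anything.
mapping = {"edad": "age", "peso_kg": "weight_kg"}
audit_column_translation(df, mapper=mapping, direction="A_to_B")

# 3. Apply it (a JSON or two-column CSV path would also be accepted as the mapper).
df_en = translate_dataframe_columns(df, mapper=mapping, direction="A_to_B")
print(df_en.columns.tolist())  # ['age', 'weight_kg']
```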
Files without changes (4): WHEEL, licenses/LICENSE, licenses/LICENSE-THIRD-PARTY.md, and top_level.txt under dragon_ml_toolbox-*.dist-info.