dragon-ml-toolbox 20.6.0__py3-none-any.whl → 20.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
dragon_ml_toolbox-20.7.0.dist-info/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: dragon-ml-toolbox
- Version: 20.6.0
+ Version: 20.7.0
  Summary: Complete pipelines and helper tools for data science and machine learning projects.
  Author-email: Karl Luigi Loza Vidaurre <luigiloza@gmail.com>
  License-Expression: MIT
dragon_ml_toolbox-20.7.0.dist-info/RECORD
@@ -1,5 +1,5 @@
- dragon_ml_toolbox-20.6.0.dist-info/licenses/LICENSE,sha256=L35WDmmLZNTlJvxF6Vy7Uy4SYNi6rCfWUqlTHpoRMoU,1081
- dragon_ml_toolbox-20.6.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=0-HBRMMgKuwtGy6nMJZvIn1fLxhx_ksyyVB2U_iyYZU,2818
+ dragon_ml_toolbox-20.7.0.dist-info/licenses/LICENSE,sha256=L35WDmmLZNTlJvxF6Vy7Uy4SYNi6rCfWUqlTHpoRMoU,1081
+ dragon_ml_toolbox-20.7.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=0-HBRMMgKuwtGy6nMJZvIn1fLxhx_ksyyVB2U_iyYZU,2818
  ml_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  ml_tools/constants.py,sha256=3br5Rk9cL2IUo638eJuMOGdbGQaWssaUecYEvSeRBLM,3322
  ml_tools/ETL_cleaning/__init__.py,sha256=gLRHF-qzwpqKTvbbn9chIQELeUDh_XGpBRX28j-5IqI,545
@@ -134,10 +134,11 @@ ml_tools/schema/_feature_schema.py,sha256=MuPf6Nf7tDhUTGyX7tcFHZh-lLSNsJkLmlf9Ix
  ml_tools/schema/_gui_schema.py,sha256=IVwN4THAdFrvh2TpV4SFd_zlzMX3eioF-w-qcSVTndE,7245
  ml_tools/serde/__init__.py,sha256=IDirr8i-qjUHB71hmHO6lGiODhUoOnUcXYrvb_XgrzE,292
  ml_tools/serde/_serde.py,sha256=8QnYK8ZG21zdNaC0v63iSz2bhgwOKRKAWxTVQvMV0A8,5525
- ml_tools/utilities/__init__.py,sha256=iQb-S5JesEjGGI8983Vkj-14LCtchFxdWRhaziyvnoY,808
+ ml_tools/utilities/__init__.py,sha256=h4lE3SQstg-opcQj6QSKhu-HkqSbmHExsWoM9vC5D9U,1035
+ ml_tools/utilities/_translate.py,sha256=t5Z7s9X3KTHn-jpe49yRdhYkzAfYzzU4EsIJiUdRnEk,10296
  ml_tools/utilities/_utility_save_load.py,sha256=EFvFaTaHahDQWdJWZr-j7cHqRbG_Xrpc96228JhV-bs,16773
  ml_tools/utilities/_utility_tools.py,sha256=bN0J9d1S0W5wNzNntBWqDsJcEAK7-1OgQg3X2fwXns0,6918
- dragon_ml_toolbox-20.6.0.dist-info/METADATA,sha256=HfSazpvNdCk-0TW27NgJuerpBdsrzGhmmUnO3g1FMe4,7866
- dragon_ml_toolbox-20.6.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- dragon_ml_toolbox-20.6.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
- dragon_ml_toolbox-20.6.0.dist-info/RECORD,,
+ dragon_ml_toolbox-20.7.0.dist-info/METADATA,sha256=MfguicRfdmedIMRUMM6qVIelIr56Mrqdjv4dvTPhB6Y,7866
+ dragon_ml_toolbox-20.7.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ dragon_ml_toolbox-20.7.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
+ dragon_ml_toolbox-20.7.0.dist-info/RECORD,,
ml_tools/utilities/__init__.py
@@ -15,6 +15,13 @@ from ._utility_tools import (
      train_dataset_yielder
  )

+ from ._translate import (
+     translate_dataframe_columns,
+     create_translation_template,
+     audit_column_translation
+ )
+
+
  from .._core import _imprimir_disponibles


@@ -27,6 +34,9 @@ __all__ = [
      "save_dataframe",
      "save_dataframe_with_schema",
      "merge_dataframes",
+     "translate_dataframe_columns",
+     "create_translation_template",
+     "audit_column_translation",
      "distribute_dataset_by_target",
      "train_dataset_orchestrator",
      "train_dataset_yielder"
ml_tools/utilities/_translate.py (new file)
@@ -0,0 +1,292 @@
+ import json
+ import pandas as pd
+ import polars as pl
+ from pathlib import Path
+ from typing import Union, Literal
+
+ from ..path_manager import make_fullpath
+ from .._core import get_logger
+
+ from ._utility_save_load import load_dataframe
+
+
+ _LOGGER = get_logger("Translation Tools")
+
+
+ __all__ = [
+     "translate_dataframe_columns",
+     "create_translation_template",
+     "audit_column_translation"
+ ]
+
+
+ def translate_dataframe_columns(
+     df: Union[pd.DataFrame, pl.DataFrame],
+     mapper: Union[dict[str, str], str, Path],
+     direction: Literal["A_to_B", "B_to_A"] = "A_to_B",
+     verbose: int = 3
+ ) -> Union[pd.DataFrame, pl.DataFrame]:
+     """
+     Translates the column names of a DataFrame (pandas or Polars) using a provided mapping source.
+
+     The mapping can be a Python dictionary, a JSON file, or a CSV file.
+
+     Translation Logic:
+     ------------------
+     The DataFrame currently has columns in 'Language A'.
+
+     - "A_to_B" (Standard):
+         The mapper is structured as {Language A : Language B}.
+         Keys match the current DataFrame columns.
+
+     - "B_to_A" (Inverted Source):
+         The mapper is structured as {Language B : Language A}.
+         Values match the current DataFrame columns.
+
+     Parameters
+     ----------
+     df : (pd.DataFrame | pl.DataFrame)
+         The input DataFrame to be translated.
+     mapper : (dict[str, str] | str | Path)
+         The source of the translation mapping:
+         - Dict: {'original_name': 'new_name'}
+         - JSON path: File containing a single JSON object (dict).
+         - CSV path: File with exactly two columns.
+     direction : Literal["A_to_B", "B_to_A"]
+         Specifies the structure of the provided mapper relative to the DataFrame.
+     verbose : int
+         Verbosity level: warnings are logged at >= 1, a success message at >= 2, and progress information at >= 3.
+
+     Returns
+     -------
+     (pd.DataFrame | pl.DataFrame):
+         The pandas or Polars DataFrame with renamed columns.
+     """
+     # df type validation
+     if not isinstance(df, (pd.DataFrame, pl.DataFrame)):
+         _LOGGER.error(f"Input df must be a pandas or polars DataFrame. Got: {type(df)}")
+         raise TypeError()
+
+     # 1. Load and Standardize the Mapping
+     translation_map = _load_translation_mapping(mapper, direction)
+
+     # 2. Validation: Check intersection between DF columns and Map keys
+     df_cols = set(df.columns)
+     map_keys = set(translation_map.keys())
+
+     # Calculate overlap
+     common_cols = df_cols.intersection(map_keys)
+
+     if not common_cols:
+         if verbose >= 1:
+             _LOGGER.warning("No column names matched the provided translation mapping. Returning original DataFrame.")
+         return df
+
+     missing_in_map = df_cols - map_keys
+     if missing_in_map and verbose >= 1:
+         _LOGGER.warning(f"Columns not found in translation map: {list(missing_in_map)}")
+
+     if verbose >= 3:
+         _LOGGER.info(f"Translating {len(common_cols)} columns...")
+
+     # 3. Apply Translation. Restrict the map to matched columns: Polars'
+     # rename raises on mapping keys that are absent from the DataFrame.
+     applicable_map = {k: v for k, v in translation_map.items() if k in common_cols}
+     try:
+         if isinstance(df, pd.DataFrame):
+             renamed_df = df.rename(columns=applicable_map)
+         else:
+             renamed_df = df.rename(applicable_map)
+     except Exception as e:
+         _LOGGER.error(f"Failed to rename columns: {e}")
+         raise
+
+     # Log success before returning (logging after an unconditional return is unreachable).
+     if verbose >= 2:
+         _LOGGER.info(f"Successfully translated {len(common_cols)} columns.")
+
+     return renamed_df
+
+
+ def create_translation_template(
+     df_or_path: Union[pd.DataFrame, pl.DataFrame, str, Path],
+     save_path: Union[str, Path],
+     verbose: bool = True
+ ) -> None:
+     """
+     Generates a JSON translation template from a DataFrame's column names.
+
+     Creates a 'translation_template.json' file where keys are the DataFrame column names and values
+     are empty strings, ready for manual translation.
+
+     Parameters
+     ----------
+     df_or_path : [DataFrame | str | Path]
+         The DataFrame or path to a CSV file to extract column names from.
+     save_path : [str | Path]
+         The destination directory for the .json template.
+     verbose : bool
+         Whether to log a confirmation message once the template is saved.
+     """
+     # 1. Get Columns
+     if isinstance(df_or_path, (str, Path)):
+         df, _ = load_dataframe(df_or_path, kind="pandas", verbose=False)
+         columns = df.columns.tolist()
+     elif isinstance(df_or_path, pd.DataFrame):
+         columns = df_or_path.columns.tolist()
+     elif isinstance(df_or_path, pl.DataFrame):
+         columns = df_or_path.columns
+     else:
+         _LOGGER.error("Input must be a DataFrame or a path to a dataset.")
+         raise TypeError()
+
+     # 2. Create Dictionary {ColName : ""}
+     template_dict = {col: "" for col in columns}
+
+     # 3. Save to JSON
+     out_path = make_fullpath(save_path, enforce="directory")
+     full_out_path = out_path / "translation_template.json"
+
+     try:
+         with open(full_out_path, 'w', encoding='utf-8') as f:
+             json.dump(template_dict, f, indent=4, ensure_ascii=False)
+
+         if verbose:
+             _LOGGER.info(f"Translation template created at '{full_out_path}' with {len(columns)} entries.")
+     except Exception as e:
+         _LOGGER.error(f"Failed to save template: {e}")
+         raise
+
+
+ def audit_column_translation(
+     df_or_path: Union[pd.DataFrame, pl.DataFrame, str, Path],
+     mapper: Union[dict[str, str], str, Path],
+     direction: Literal["A_to_B", "B_to_A"] = "A_to_B"
+ ) -> None:
+     """
+     Audits the coverage of a translation map against a DataFrame WITHOUT applying changes.
+
+     Logs a detailed report of:
+     - How many columns will be renamed.
+     - Which DataFrame columns are NOT in the map (will remain unchanged).
+     - Which map keys are NOT in the DataFrame (unused mappings).
+
+     Parameters
+     ----------
+     df_or_path : [DataFrame | str | Path]
+         The target dataset to audit.
+     mapper : [Dict | str | Path]
+         The translation source.
+     direction : ["A_to_B" | "B_to_A"]
+         Direction logic (see translate_dataframe_columns).
+     """
+     # 1. Get DataFrame Columns
+     if isinstance(df_or_path, (str, Path)):
+         df, df_name = load_dataframe(df_or_path, kind="pandas", verbose=False)
+         cols = set(df.columns)
+         source_name = f"File: '{df_name}'"
+     elif isinstance(df_or_path, pd.DataFrame):
+         cols = set(df_or_path.columns)
+         source_name = "DataFrame (Pandas)"
+     elif isinstance(df_or_path, pl.DataFrame):
+         cols = set(df_or_path.columns)
+         source_name = "DataFrame (Polars)"
+     else:
+         _LOGGER.error("Input must be a DataFrame or a path to a dataset.")
+         raise TypeError()
+
+     # 2. Load Map
+     try:
+         trans_map = _load_translation_mapping(mapper, direction)
+         map_keys = set(trans_map.keys())
+     except Exception as e:
+         _LOGGER.error(f"Could not load mapper. {e}")
+         return
+
+     # 3. Analyze Sets
+     matched = cols.intersection(map_keys)
+     missing_in_map = cols - map_keys
+     unused_map_keys = map_keys - cols
+
+     coverage_pct = (len(matched) / len(cols) * 100) if len(cols) > 0 else 0.0
+
+     # 4. Report (adjacent f-strings avoid the stray indentation that
+     # backslash continuations would embed in the logged message)
+     report_string = (
+         f"--- 🔍 Translation Audit Report: {source_name} ---\n"
+         f"Direction: {direction}\n"
+         f"Total Columns: {len(cols)}\n"
+         f"Map Coverage: {len(matched)} / {len(cols)} ({coverage_pct:.1f}%)\n"
+     )
+
+     if matched:
+         report_string += f"\n✅ Will Translate: {len(matched)} columns"
+
+     if missing_in_map:
+         report_string += f"\n⚠️ Not in Map: {len(missing_in_map)} columns: {list(missing_in_map)}"
+
+     if unused_map_keys:
+         report_string += f"\n➡️ Unused Map Keys: {len(unused_map_keys)}"
+
+     _LOGGER.info(report_string)
+
+
+ def _load_translation_mapping(
+     source: Union[dict[str, str], str, Path],
+     direction: Literal["A_to_B", "B_to_A"]
+ ) -> dict[str, str]:
+     """
+     Internal helper to load a mapping from a Dict, JSON, or CSV source and handle direction inversion.
+     """
+     raw_map: dict[str, str] = {}
+
+     # --- Load Source ---
+     if isinstance(source, dict):
+         raw_map = source.copy()
+
+     elif isinstance(source, (str, Path)):
+         path = make_fullpath(source, enforce="file")
+
+         if path.suffix.lower() == ".json":
+             with open(path, 'r', encoding='utf-8') as f:
+                 content = json.load(f)
+             if not isinstance(content, dict):
+                 _LOGGER.error(f"JSON file '{path.name}' does not contain a dictionary.")
+                 raise ValueError()
+             raw_map = content
+
+         elif path.suffix.lower() == ".csv":
+             # Load CSV using pandas for robustness
+             try:
+                 df_map = pd.read_csv(path)
+
+                 # STRICT VALIDATION: Must be exactly 2 columns
+                 if df_map.shape[1] != 2:
+                     _LOGGER.error(f"CSV file '{path.name}' must have exactly 2 columns for mapping. Found {df_map.shape[1]}.")
+                     raise ValueError()
+
+                 key_col = df_map.columns[0]
+                 val_col = df_map.columns[1]
+
+                 # Convert to dictionary (drop NaNs to be safe)
+                 raw_map = df_map.dropna(subset=[key_col, val_col]).set_index(key_col)[val_col].to_dict()
+
+             except Exception as e:
+                 _LOGGER.error(f"Error reading CSV mapping file: {e}")
+                 raise
+         else:
+             _LOGGER.error(f"Unsupported file extension for mapping source: {path.suffix}")
+             raise ValueError()
+     else:
+         _LOGGER.error("Mapper must be a dictionary, or a path/string to a JSON/CSV file.")
+         raise TypeError()
+
+     # --- Handle Direction ---
+     # Case: The mapper is A->B and the DF is in A (keys match the DF). Return as is.
+     if direction == "A_to_B":
+         return raw_map
+
+     # Case: The mapper is B->A but the DF is in A (values match the DF).
+     # Swap the mapper to A->B so the keys match the DF.
+     elif direction == "B_to_A":
+         # Inversion requires unique values to be lossless
+         reversed_map = {v: k for k, v in raw_map.items()}
+
+         if len(reversed_map) < len(raw_map):
+             _LOGGER.warning("Direction 'B_to_A' produced fewer keys than the original map: duplicate values in the source map collided during inversion and were overwritten.")
+
+         return reversed_map
+
+     else:
+         _LOGGER.error("Direction must be 'A_to_B' or 'B_to_A'.")
+         raise ValueError()
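
Taken together, the new module supports a template-driven workflow: generate a template, fill it in by hand, audit coverage, then translate. A usage sketch under those assumptions (the paths and column names below are hypothetical):

# Illustrative only: assumes dragon-ml-toolbox >= 20.7.0 and an existing ./i18n directory.
import pandas as pd
from ml_tools.utilities import (
    create_translation_template,
    audit_column_translation,
    translate_dataframe_columns,
)

df = pd.DataFrame({"nombre": ["Ana"], "edad": [30], "ciudad": ["Lima"]})

# 1. Write 'translation_template.json' ({column: ""}) into ./i18n for manual filling.
create_translation_template(df, save_path="i18n")

# 2. With the template filled in, report coverage without renaming anything:
#    matched columns, columns missing from the map, and unused map keys.
audit_column_translation(df, mapper="i18n/translation_template.json")

# 3. Apply the mapping. A "B_to_A" mapper ({translated: original}) would be
#    inverted internally so its keys line up with the current DataFrame columns.
df_en = translate_dataframe_columns(df, mapper="i18n/translation_template.json")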