dragon-ml-toolbox 12.1.0__py3-none-any.whl → 12.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dragon-ml-toolbox might be problematic. Click here for more details.

@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dragon-ml-toolbox
3
- Version: 12.1.0
3
+ Version: 12.2.0
4
4
  Summary: A collection of tools for data science and machine learning projects.
5
5
  Author-email: "Karl L. Loza Vidaurre" <luigiloza@gmail.com>
6
6
  License-Expression: MIT
@@ -1,5 +1,5 @@
1
- dragon_ml_toolbox-12.1.0.dist-info/licenses/LICENSE,sha256=L35WDmmLZNTlJvxF6Vy7Uy4SYNi6rCfWUqlTHpoRMoU,1081
2
- dragon_ml_toolbox-12.1.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=iy2r_R7wjzsCbz_Q_jMsp_jfZ6oP8XW9QhwzRBH0mGY,1904
1
+ dragon_ml_toolbox-12.2.0.dist-info/licenses/LICENSE,sha256=L35WDmmLZNTlJvxF6Vy7Uy4SYNi6rCfWUqlTHpoRMoU,1081
2
+ dragon_ml_toolbox-12.2.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=iy2r_R7wjzsCbz_Q_jMsp_jfZ6oP8XW9QhwzRBH0mGY,1904
3
3
  ml_tools/ETL_cleaning.py,sha256=PLRSR-VYnt1nNT9XrcWq40SE0VzHCw7DQ8v9czfSQsU,20366
4
4
  ml_tools/ETL_engineering.py,sha256=l0I6Og9o4s6EODdk0kZXjbbC-a3vVPYy1FopP2BkQSQ,54909
5
5
  ml_tools/GUI_tools.py,sha256=Va6ig-dHULPVRwQYYtH3fvY5XPIoqRcJpRW8oXC55Hw,45413
@@ -24,7 +24,7 @@ ml_tools/_logger.py,sha256=dlp5cGbzooK9YSNSZYB4yjZrOaQUGW8PTrM411AOvL8,4717
24
24
  ml_tools/_script_info.py,sha256=21r83LV3RubsNZ_RTEUON6RbDf7Mh4_udweNcvdF_Fk,212
25
25
  ml_tools/constants.py,sha256=3br5Rk9cL2IUo638eJuMOGdbGQaWssaUecYEvSeRBLM,3322
26
26
  ml_tools/custom_logger.py,sha256=OZqG7FR_UE6byzY3RDmlj08a336ZU-4DzNBMPLr_d5c,5881
27
- ml_tools/data_exploration.py,sha256=is9P4c4orIKW6gRhTeScZlCGYH9ODguxMtVlrVubb4E,42515
27
+ ml_tools/data_exploration.py,sha256=H-cHp6jL4u4Kl2L_fktcCdQWRdAzTC6kwFCrOHnzLNA,46549
28
28
  ml_tools/ensemble_evaluation.py,sha256=FGHSe8LBI8_w8LjNeJWOcYQ1UK_mc6fVah8gmSvNVGg,26853
29
29
  ml_tools/ensemble_inference.py,sha256=0yLmLNj45RVVoSCLH1ZYJG9IoAhTkWUqEZmLOQTFGTY,9348
30
30
  ml_tools/ensemble_learning.py,sha256=aTPeKthO4zRWBEaQJOUj8jEqVHiHjjOMXuiEWjI9NxM,21946
@@ -35,7 +35,7 @@ ml_tools/optimization_tools.py,sha256=bkKrTjukNOpxgVDMW5mUX5vQ72ckBcS5VA4eG8uZsO
35
35
  ml_tools/path_manager.py,sha256=CyDU16pOKmC82jPubqJPT6EBt-u-3rGVbxyPIZCvDDY,18432
36
36
  ml_tools/serde.py,sha256=k0qAwfMf13lVBQSgq5u9MSXEoo31iOA2-Ncm8XgMCMI,3974
37
37
  ml_tools/utilities.py,sha256=gef62GLK7ev5BWkkQekeJoVZqwf2mIuOlOfyCw6WdtE,13882
38
- dragon_ml_toolbox-12.1.0.dist-info/METADATA,sha256=PJbBSG9h6juu_srL07VVhgOIGqebQwn_rlI1RgZdTwo,6166
39
- dragon_ml_toolbox-12.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
40
- dragon_ml_toolbox-12.1.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
41
- dragon_ml_toolbox-12.1.0.dist-info/RECORD,,
38
+ dragon_ml_toolbox-12.2.0.dist-info/METADATA,sha256=WS3Im1AwRObhKUkNPDkW0xRM8gdrylqavE9svIVRFKY,6166
39
+ dragon_ml_toolbox-12.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
40
+ dragon_ml_toolbox-12.2.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
41
+ dragon_ml_toolbox-12.2.0.dist-info/RECORD,,
@@ -3,7 +3,7 @@ from pandas.api.types import is_numeric_dtype
3
3
  import numpy as np
4
4
  import matplotlib.pyplot as plt
5
5
  import seaborn as sns
6
- from typing import Union, Literal, Dict, Tuple, List, Optional
6
+ from typing import Union, Literal, Dict, Tuple, List, Optional, Any
7
7
  from pathlib import Path
8
8
  import re
9
9
 
@@ -33,7 +33,8 @@ __all__ = [
33
33
  "match_and_filter_columns_by_regex",
34
34
  "standardize_percentages",
35
35
  "create_transformer_categorical_map",
36
- "reconstruct_one_hot"
36
+ "reconstruct_one_hot",
37
+ "reconstruct_binary"
37
38
  ]
38
39
 
39
40
 
@@ -1081,7 +1082,110 @@ def reconstruct_one_hot(
1081
1082
  unique_cols_to_drop = list(set(all_ohe_cols_to_drop))
1082
1083
  new_df.drop(columns=unique_cols_to_drop, inplace=True)
1083
1084
  _LOGGER.info(f"Dropped {len(unique_cols_to_drop)} original one-hot encoded columns.")
1084
-
1085
+
1086
+ _LOGGER.info(f"Successfully reconstructed {reconstructed_count} feature(s).")
1087
+
1088
+ return new_df
1089
+
1090
+
1091
def reconstruct_binary(
    df: pd.DataFrame,
    reconstruction_map: Dict[str, Tuple[str, Any, Any]],
    drop_original: bool = True,
    verbose: bool = True
) -> pd.DataFrame:
    """
    Reconstructs new categorical columns from existing binary (0/1) columns.

    Used to reverse a binary encoding by mapping 0 and 1 back to
    descriptive categorical labels.

    Args:
        df (pd.DataFrame):
            The input DataFrame.
        reconstruction_map (Dict[str, Tuple[str, Any, Any]]):
            A dictionary defining the reconstructions.
            Format:
            { "new_col_name": ("source_col_name", "label_for_0", "label_for_1") }
            Example:
            {
                "Sex": ("Sex_male", "Female", "Male"),
                "Smoker": ("Is_Smoker", "No", "Yes")
            }
        drop_original (bool):
            If True, the original binary source columns (e.g., "Sex_male")
            will be dropped from the returned DataFrame.
        verbose (bool):
            If True, prints the details of each reconstruction.

    Returns:
        pd.DataFrame:
            A new DataFrame with the reconstructed categorical columns.

    Raises:
        TypeError: If `df` is not a pandas DataFrame.
        ValueError: If `reconstruction_map` is not a dictionary or a
            configuration is invalid (e.g., column name collision).

    Notes:
        - The function operates on a copy of the DataFrame.
        - Rows with `NaN` in the source column will have `NaN` in the
          new column.
        - Values in the source column other than 0 or 1 (e.g., 2) will
          result in `NaN` in the new column.
    """
    if not isinstance(df, pd.DataFrame):
        _LOGGER.error("Input must be a pandas DataFrame.")
        raise TypeError()

    if not isinstance(reconstruction_map, dict):
        _LOGGER.error("`reconstruction_map` must be a dictionary with the required format.")
        raise ValueError()

    new_df = df.copy()
    source_cols_to_drop: List[str] = []
    reconstructed_count = 0

    _LOGGER.info(f"Attempting to reconstruct {len(reconstruction_map)} binary feature(s).")

    for new_col_name, config in reconstruction_map.items():

        # --- 1. Validation ---
        # BUG FIX: these error messages previously ended with "Skipping.",
        # but the function aborts by raising ValueError rather than skipping
        # the entry — the wording contradicted the actual control flow.
        if not (isinstance(config, tuple) and len(config) == 3):
            _LOGGER.error(f"Config for '{new_col_name}' is invalid. Must be a 3-item tuple.")
            raise ValueError()

        source_col, label_for_0, label_for_1 = config

        if source_col not in new_df.columns:
            _LOGGER.error(f"Source column '{source_col}' for new column '{new_col_name}' not found.")
            raise ValueError()

        if new_col_name in new_df.columns and verbose:
            _LOGGER.warning(f"New column '{new_col_name}' already exists and will be overwritten.")

        # A self-referencing mapping would drop the freshly built column
        # when drop_original=True, so it is rejected outright.
        if new_col_name == source_col:
            _LOGGER.error(f"New column name '{new_col_name}' cannot be the same as source column '{source_col}'.")
            raise ValueError()

        # --- 2. Reconstruction ---
        # .map() handles 0, 1, preserves NaNs, and converts any other value to NaN.
        mapping_dict = {0: label_for_0, 1: label_for_1}
        new_df[new_col_name] = new_df[source_col].map(mapping_dict)

        # --- 3. Logging/Tracking ---
        source_cols_to_drop.append(source_col)
        reconstructed_count += 1
        if verbose:
            print(f"  - Reconstructed '{new_col_name}' from '{source_col}' (0='{label_for_0}', 1='{label_for_1}').")

    # --- 4. Cleanup ---
    if drop_original and source_cols_to_drop:
        # Use set() to avoid duplicates if the same source col was used
        unique_cols_to_drop = list(set(source_cols_to_drop))
        new_df.drop(columns=unique_cols_to_drop, inplace=True)
        _LOGGER.info(f"Dropped {len(unique_cols_to_drop)} original binary source column(s).")

    _LOGGER.info(f"Successfully reconstructed {reconstructed_count} feature(s).")

    return new_df