dragon-ml-toolbox 20.13.0__py3-none-any.whl → 20.14.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
--- dragon_ml_toolbox-20.13.0.dist-info/METADATA
+++ dragon_ml_toolbox-20.14.1.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 20.13.0
+Version: 20.14.1
 Summary: Complete pipelines and helper tools for data science and machine learning projects.
 Author-email: Karl Luigi Loza Vidaurre <luigiloza@gmail.com>
 License-Expression: MIT
--- dragon_ml_toolbox-20.13.0.dist-info/RECORD
+++ dragon_ml_toolbox-20.14.1.dist-info/RECORD
@@ -1,5 +1,5 @@
-dragon_ml_toolbox-20.13.0.dist-info/licenses/LICENSE,sha256=L35WDmmLZNTlJvxF6Vy7Uy4SYNi6rCfWUqlTHpoRMoU,1081
-dragon_ml_toolbox-20.13.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=0-HBRMMgKuwtGy6nMJZvIn1fLxhx_ksyyVB2U_iyYZU,2818
+dragon_ml_toolbox-20.14.1.dist-info/licenses/LICENSE,sha256=L35WDmmLZNTlJvxF6Vy7Uy4SYNi6rCfWUqlTHpoRMoU,1081
+dragon_ml_toolbox-20.14.1.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=0-HBRMMgKuwtGy6nMJZvIn1fLxhx_ksyyVB2U_iyYZU,2818
 ml_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ml_tools/constants.py,sha256=3br5Rk9cL2IUo638eJuMOGdbGQaWssaUecYEvSeRBLM,3322
 ml_tools/ETL_cleaning/__init__.py,sha256=gLRHF-qzwpqKTvbbn9chIQELeUDh_XGpBRX28j-5IqI,545
@@ -80,7 +80,7 @@ ml_tools/ML_optimization/__init__.py,sha256=No18Dsw6Q9zPt8B9fpG0bWomuXmwDC7Dioki
 ml_tools/ML_optimization/_multi_dragon.py,sha256=zQhDxFY8FNxUlcbSnHMVArfojzYjgNa21jSE3pJmRW0,38956
 ml_tools/ML_optimization/_single_dragon.py,sha256=jh5-SK6NKAzbheQhquiYoROozk-RzUv1jiFkIzK_AFg,7288
 ml_tools/ML_optimization/_single_manual.py,sha256=h-_k9JmRqPkjTra1nu7AyYbSyWkYZ1R3utiNmW06WFs,21809
-ml_tools/ML_scaler/_ML_scaler.py,sha256=P75X0Sx8N-VxC2Qy8aG7mWaZlkTfjspiZDi1YiMQD1I,8872
+ml_tools/ML_scaler/_ML_scaler.py,sha256=NcwprqrAHMIKpkzMdExk99I2QpfTSbiJH8rDqmOlnkU,8870
 ml_tools/ML_scaler/__init__.py,sha256=SHDNyLsoOLl2OtkIb3pGg-JRs3E2bYJBgnHwH3vw_Tk,172
 ml_tools/ML_trainer/__init__.py,sha256=42kueHa7Z0b_yLbywNCgIxlW6WmgLBqkTFwKH7vFLXw,379
 ml_tools/ML_trainer/_base_trainer.py,sha256=0ATm672NRsjJ6nv_NEl6-OEd9Bst1-s5OPxfG4qe8Lg,18075
@@ -104,11 +104,11 @@ ml_tools/_core/__init__.py,sha256=m-VP0RW0tOTm9N5NI3kFNcpM7WtVgs0RK9pK3ZJRZQQ,14
 ml_tools/_core/_logger.py,sha256=xzhn_FouMDRVNwXGBGlPC9Ruq6i5uCrmNaS5jesguMU,4972
 ml_tools/_core/_schema_load_ops.py,sha256=KLs9vBzANz5ESe2wlP-C41N4VlgGil-ywcfvWKSOGss,1551
 ml_tools/_core/_script_info.py,sha256=LtFGt10gEvCnhIRMKJPi2yXkiGLcdr7lE-oIP2XGHzQ,234
-ml_tools/data_exploration/__init__.py,sha256=efUBsruHL56B429tUadl3PdG73zAF639Y430uMQRfko,1917
-ml_tools/data_exploration/_analysis.py,sha256=PJNrEBz5ZZXHoUlQ6fh9Y86nzPQrLpVPv2Ye4NfOxgs,14181
+ml_tools/data_exploration/__init__.py,sha256=XNA8gcRx5ifrv092HA7HSpek8havlk_3RZi9aq9dSjg,1957
+ml_tools/data_exploration/_analysis.py,sha256=JSoFJSkv4-_v9YxxmjHZ_PeFRneDENjSEo2sy_uC4oY,14196
 ml_tools/data_exploration/_cleaning.py,sha256=pAZOXgGK35j7O8q6cnyTwYK1GLNnD04A8p2fSyMB1mg,20906
-ml_tools/data_exploration/_features.py,sha256=Z1noJfDxBzFRfusFp6NlpLF2NItuZuzFHq4ssWFqny4,26273
-ml_tools/data_exploration/_plotting.py,sha256=zH1dPcIoAlOuww23xIoBCsQOAshPPv9OyGposOA2RvI,19883
+ml_tools/data_exploration/_features.py,sha256=_aBMW7eqSm6oUj54ftidsv9zdywOkc1eyZgITb82XF8,29237
+ml_tools/data_exploration/_plotting.py,sha256=Vg9qS46akbAyrZAgBrPWg2p29V5vqqY4Bk4SHwZLZNI,19995
 ml_tools/data_exploration/_schema_ops.py,sha256=Fd6fBGGv4OpxmJ1HG9pith6QL90z0tzssCvzkQxlEEQ,11083
 ml_tools/ensemble_evaluation/__init__.py,sha256=t4Gr8EGEk8RLatyc92-S0BzbQvdvodzoF-qDAH2qjVg,546
 ml_tools/ensemble_evaluation/_ensemble_evaluation.py,sha256=-sX9cLMaa0FOQDikmVv2lsCYtQ56Kftd3tILnNej0Hg,28346
@@ -143,7 +143,7 @@ ml_tools/utilities/__init__.py,sha256=h4lE3SQstg-opcQj6QSKhu-HkqSbmHExsWoM9vC5D9
 ml_tools/utilities/_translate.py,sha256=U8hRPa3PmTpIf9n9yR3gBGmp_hkcsjQLwjAHSHc0WHs,10325
 ml_tools/utilities/_utility_save_load.py,sha256=EFvFaTaHahDQWdJWZr-j7cHqRbG_Xrpc96228JhV-bs,16773
 ml_tools/utilities/_utility_tools.py,sha256=bN0J9d1S0W5wNzNntBWqDsJcEAK7-1OgQg3X2fwXns0,6918
-dragon_ml_toolbox-20.13.0.dist-info/METADATA,sha256=bTnTpMlvOFu2IlYpmc0QphbYeqbslxzptluUbEWaO-s,7889
-dragon_ml_toolbox-20.13.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
-dragon_ml_toolbox-20.13.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
-dragon_ml_toolbox-20.13.0.dist-info/RECORD,,
+dragon_ml_toolbox-20.14.1.dist-info/METADATA,sha256=oV6v5gFhRVLpuJ3HgL7Qpn8_Dgk9DGkYcOjSfl2kIh0,7889
+dragon_ml_toolbox-20.14.1.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+dragon_ml_toolbox-20.14.1.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
+dragon_ml_toolbox-20.14.1.dist-info/RECORD,,
--- ml_tools/ML_scaler/_ML_scaler.py
+++ ml_tools/ML_scaler/_ML_scaler.py
@@ -99,7 +99,7 @@ class DragonScaler:
         std = torch.sqrt(torch.clamp(variance, min=1e-8))
 
         if verbose >= 2:
-            _LOGGER.info(f"Scaler fitted on {n_total} samples for {num_continuous_features} features (Welford's).")
+            _LOGGER.info(f"Scaler fitted on {n_total} samples for {num_continuous_features} columns (Welford's).")
         return cls(mean=mean_global, std=std, continuous_feature_indices=continuous_feature_indices)
 
     @classmethod
@@ -121,7 +121,7 @@ class DragonScaler:
         std = torch.where(std == 0, torch.tensor(1.0, device=data.device), std)
 
         if verbose >= 2:
-            _LOGGER.info(f"Scaler fitted on tensor with {data.shape[0]} samples for {num_features} features.")
+            _LOGGER.info(f"Scaler fitted on tensor with {data.shape[0]} samples for {num_features} columns.")
 
         return cls(mean=mean, std=std, continuous_feature_indices=indices)
 
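Note on the first scaler hunk: the "(Welford's)" tag in the log message refers to Welford's online algorithm, which accumulates mean and variance across batches in a single pass. For reference only, a minimal sketch of the standard batch-merge step (Chan et al.), with hypothetical names that are not the library's internals:

    import torch

    def welford_merge(n_a: int, mean_a: torch.Tensor, m2_a: torch.Tensor,
                      batch: torch.Tensor) -> tuple[int, torch.Tensor, torch.Tensor]:
        # m2 is the running sum of squared deviations; variance = m2 / (n - 1)
        n_b = batch.shape[0]
        mean_b = batch.mean(dim=0)
        m2_b = ((batch - mean_b) ** 2).sum(dim=0)
        n = n_a + n_b
        delta = mean_b - mean_a
        mean = mean_a + delta * (n_b / n)
        m2 = m2_a + m2_b + delta.pow(2) * (n_a * n_b / n)
        return n, mean, m2

    # After the last batch: std = torch.sqrt(torch.clamp(m2 / (n - 1), min=1e-8)),
    # mirroring the clamp visible in the hunk above.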
--- ml_tools/data_exploration/__init__.py
+++ ml_tools/data_exploration/__init__.py
@@ -33,6 +33,7 @@ from ._features import (
     reconstruct_one_hot,
     reconstruct_binary,
     reconstruct_multibinary,
+    filter_subset,
 )
 
 from ._schema_ops import (
@@ -51,6 +52,7 @@ __all__ = [
     "drop_columns_with_missing_data",
     "drop_macro",
     "clean_column_names",
+    "filter_subset",
     "plot_value_distributions",
     "split_features_targets",
     "split_continuous_binary",
--- ml_tools/data_exploration/_analysis.py
+++ ml_tools/data_exploration/_analysis.py
@@ -34,7 +34,7 @@ def summarize_dataframe(df: pd.DataFrame, round_digits: int = 2):
     """
     summary = pd.DataFrame({
         'Data Type': df.dtypes,
-        'Completeness %': (df.notnull().mean() * 100).round(2),
+        'Completeness %': (df.notnull().mean() * 100).round(2), # type: ignore
         'Unique Values': df.nunique(),
         # 'Missing %': (df.isnull().mean() * 100).round(2)
     })
--- ml_tools/data_exploration/_features.py
+++ ml_tools/data_exploration/_features.py
@@ -168,6 +168,13 @@ def split_continuous_categorical_targets(
             f" - Categorical: {df_categorical.shape}\n"
             f" - Targets: {df_targets.shape}"
         )
+
+    if isinstance(df_continuous, pd.Series):
+        df_continuous = df_continuous.to_frame()
+    if isinstance(df_categorical, pd.Series):
+        df_categorical = df_categorical.to_frame()
+    if isinstance(df_targets, pd.Series):
+        df_targets = df_targets.to_frame()
 
     return df_continuous, df_categorical, df_targets
 
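The isinstance guards added above cover a pandas edge case: when a split leaves exactly one column, indexing can hand back a pd.Series instead of a pd.DataFrame, which breaks callers expecting three frames. A small illustration using plain pandas (example data invented):

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2], "b": [3.0, 4.0]})
    one_col = df["a"]              # a Series, not a DataFrame
    print(type(one_col).__name__)  # Series
    frame = one_col.to_frame()     # promoted back to a (2, 1) DataFrame
    print(type(frame).__name__)    # DataFrame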
@@ -271,6 +278,7 @@ def encode_classification_target(
     df: pd.DataFrame,
     target_col: str,
     save_dir: Union[str, Path],
+    suffix: str = "",
     verbose: int = 2
 ) -> tuple[pd.DataFrame, dict[str, int]]:
     """
@@ -283,6 +291,7 @@ def encode_classification_target(
         df (pd.DataFrame): Input DataFrame.
         target_col (str): Name of the target column to encode.
         save_dir (str | Path): Directory where the class map JSON will be saved.
+        suffix (str): Suffix to append to the class map filename.
         verbose (int): Verbosity level for logging.
 
     Returns:
@@ -300,9 +309,17 @@ def encode_classification_target(
         _LOGGER.error(f"Target column '{target_col}' contains {n_missing} missing values. Please handle them before encoding.")
         raise ValueError()
 
+    # validate suffix and prepend underscore if needed
+    if suffix:
+        if not suffix.startswith("_"):
+            suffix = f"_{suffix}"
+        sanitized_suffix = suffix
+    else:
+        sanitized_suffix = ''
+
     # Ensure directory exists
     save_path = make_fullpath(save_dir, make=True, enforce="directory")
-    file_path = save_path / "class_map.json"
+    file_path = save_path / f"class_map{sanitized_suffix}.json"
 
     # Get unique values and sort them to ensure deterministic encoding (0, 1, 2...)
     # Convert to string to ensure the keys in JSON are strings
@@ -322,10 +339,9 @@ def encode_classification_target(
         json.dump(class_map, f, indent=4)
 
     if verbose >= 2:
-        _LOGGER.info(f"Class mapping saved to: '{file_path}'")
-
+        _LOGGER.info(f"Target '{target_col}' encoded with {len(class_map)} classes. Saved to {file_path}.")
+
     if verbose >= 3:
-        _LOGGER.info(f"Target '{target_col}' encoded with {len(class_map)} classes.")
         # Print a preview
         if len(class_map) <= 10:
             print(f" Mapping: {class_map}")
@@ -657,3 +673,66 @@ def reconstruct_multibinary(
         _LOGGER.info(f"Reconstructed {converted_count} binary columns matching '{pattern}'.")
 
     return new_df, target_columns
+
+
+def filter_subset(
+    df: pd.DataFrame,
+    filters: Union[dict[str, Any], dict[str, list[Any]]],
+    drop_filter_cols: bool = True,
+    reset_index: bool = True,
+    verbose: int = 3
+) -> pd.DataFrame:
+    """
+    Filters the DataFrame based on a dictionary of column-value conditions.
+
+    Supports:
+    - Single value matching (e.g., {"Color": "Blue"})
+    - Multiple value matching (e.g., {"Color": ["Blue", "Red"]}) -> OR logic within column.
+    - Multiple column filtering (e.g., {"Color": "Blue", "Size": "Large"}) -> AND logic between columns.
+
+    Args:
+        df (pd.DataFrame): Input DataFrame.
+        filters (dict[str, Any] | dict[str, list[Any]]): Dictionary where keys are column names and values are the target values (scalar or list).
+        drop_filter_cols (bool): If True, drops the columns used for filtering from the result.
+        reset_index (bool): If True, resets the index of the resulting DataFrame.
+        verbose (int): Verbosity level.
+
+    Returns:
+        pd.DataFrame: The filtered DataFrame.
+    """
+    df_filtered = df.copy()
+
+    # Validate columns exist
+    missing_cols = [col for col in filters.keys() if col not in df.columns]
+    if missing_cols:
+        _LOGGER.error(f"Filter columns not found: {missing_cols}")
+        raise ValueError()
+
+    if verbose >= 2:
+        _LOGGER.info(f"Original shape: {df.shape}")
+
+    for col, value in filters.items():
+        # Handle list of values (OR logic within column)
+        if isinstance(value, list):
+            df_filtered = df_filtered[df_filtered[col].isin(value)]
+        # Handle single value
+        else:
+            # Warn if the value is a floating point due to potential precision issues
+            if isinstance(value, float) and verbose >= 1:
+                _LOGGER.warning(f"Filtering on column '{col}' with float value '{value}'.")
+            df_filtered = df_filtered[df_filtered[col] == value]
+
+    if drop_filter_cols:
+        if verbose >= 3:
+            _LOGGER.info(f"Dropping filter columns: {list(filters.keys())}")
+        df_filtered.drop(columns=list(filters.keys()), inplace=True)
+
+    if reset_index:
+        if verbose >= 3:
+            _LOGGER.info("Resetting index of the filtered DataFrame.")
+        df_filtered.reset_index(drop=True, inplace=True)
+
+    if verbose >= 2:
+        _LOGGER.info(f"Filtered shape: {df_filtered.shape}")
+
+    return df_filtered
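A usage sketch for the new filter_subset, following the semantics documented in its docstring (list values are OR-ed within a column, separate keys are AND-ed across columns); the example data is invented:

    import pandas as pd

    df = pd.DataFrame({
        "Color": ["Blue", "Red", "Green", "Blue"],
        "Size":  ["Large", "Large", "Small", "Small"],
        "Price": [10, 20, 30, 40],
    })

    # Keep rows where Color is Blue OR Red, AND Size is Large.
    # By default the filter columns are dropped and the index is reset.
    subset = filter_subset(df, {"Color": ["Blue", "Red"], "Size": "Large"})
    # -> a one-column DataFrame (Price) with rows [10, 20]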
--- ml_tools/data_exploration/_plotting.py
+++ ml_tools/data_exploration/_plotting.py
@@ -475,6 +475,9 @@ def plot_correlation_heatmap(df: pd.DataFrame,
     save_path = make_fullpath(save_dir, make=True)
     # sanitize the plot title to save the file
     sanitized_plot_title = sanitize_filename(plot_title)
+    # prepend method to filename
+    sanitized_plot_title = f"{method}_{sanitized_plot_title}"
+
     plot_filename = sanitized_plot_title + ".svg"
 
     full_path = save_path / plot_filename
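The practical effect of the last hunk, assuming method is the correlation-method argument of plot_correlation_heatmap and that sanitize_filename replaces spaces with underscores: heatmaps for different methods saved to the same directory no longer overwrite one another.

    # Before: plot_title "Correlation Heatmap" always saved as Correlation_Heatmap.svg
    # After:  method="spearman" -> spearman_Correlation_Heatmap.svg
    #         method="pearson"  -> pearson_Correlation_Heatmap.svg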