dragon-ml-toolbox 4.2.2__tar.gz → 4.4.0__tar.gz
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release: this version of dragon-ml-toolbox might be problematic. Review the release details before installing.
- {dragon_ml_toolbox-4.2.2/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-4.4.0}/PKG-INFO +1 -1
- {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0/dragon_ml_toolbox.egg-info}/PKG-INFO +1 -1
- {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/ml_tools/ETL_engineering.py +1 -1
- {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/ml_tools/custom_logger.py +15 -5
- {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/ml_tools/data_exploration.py +49 -30
- {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/pyproject.toml +1 -1
- {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/LICENSE +0 -0
- {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/README.md +0 -0
- {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/dragon_ml_toolbox.egg-info/SOURCES.txt +0 -0
- {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/dragon_ml_toolbox.egg-info/dependency_links.txt +0 -0
- {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/dragon_ml_toolbox.egg-info/requires.txt +0 -0
- {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/dragon_ml_toolbox.egg-info/top_level.txt +0 -0
- {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/ml_tools/GUI_tools.py +0 -0
- {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/ml_tools/MICE_imputation.py +0 -0
- {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/ml_tools/ML_callbacks.py +0 -0
- {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/ml_tools/ML_evaluation.py +0 -0
- {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/ml_tools/ML_inference.py +0 -0
- {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/ml_tools/ML_trainer.py +0 -0
- {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/ml_tools/PSO_optimization.py +0 -0
- {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/ml_tools/RNN_forecast.py +0 -0
- {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/ml_tools/SQL.py +0 -0
- {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/ml_tools/VIF_factor.py +0 -0
- {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/ml_tools/__init__.py +0 -0
- {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/ml_tools/_logger.py +0 -0
- {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/ml_tools/_pytorch_models.py +0 -0
- {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/ml_tools/_script_info.py +0 -0
- {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/ml_tools/datasetmaster.py +0 -0
- {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/ml_tools/ensemble_inference.py +0 -0
- {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/ml_tools/ensemble_learning.py +0 -0
- {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/ml_tools/handle_excel.py +0 -0
- {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/ml_tools/keys.py +0 -0
- {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/ml_tools/path_manager.py +0 -0
- {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/ml_tools/utilities.py +0 -0
- {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/setup.cfg +0 -0
ml_tools/ETL_engineering.py

```diff
@@ -569,7 +569,7 @@ class NumberExtractor:
         self,
         regex_pattern: str = r"(\d+\.?\d*)",
         dtype: Literal["float", "int"] = "float",
-        round_digits: Optional[int] =
+        round_digits: Optional[int] = 2,
     ):
         # --- Validation ---
         if not isinstance(regex_pattern, str):
```
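The only change to NumberExtractor is the round_digits default, which becomes 2. Below is a minimal sketch of what that default implies for callers, using the parameters visible in the hunk; the extraction logic is an illustrative stand-in, not the library's implementation.

```python
import re
from typing import Literal, Optional

def extract_number(
    text: str,
    regex_pattern: str = r"(\d+\.?\d*)",
    dtype: Literal["float", "int"] = "float",
    round_digits: Optional[int] = 2,
):
    """Illustrative stand-in for NumberExtractor's extraction step."""
    match = re.search(regex_pattern, text)
    if match is None:
        return None
    value = float(match.group(1))
    if dtype == "int":
        return int(value)
    # With the new 4.4.0 default, floats are rounded to 2 digits
    # unless round_digits is explicitly set to None.
    return round(value, round_digits) if round_digits is not None else value

print(extract_number("pH 7.4321"))  # 7.43 under the new default
```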
ml_tools/custom_logger.py

```diff
@@ -1,9 +1,9 @@
 from pathlib import Path
 from datetime import datetime
 from typing import Union, List, Dict, Any
-import pandas as pd
 import traceback
 import json
+import csv
 from .path_manager import sanitize_filename, make_fullpath
 from ._script_info import _script_info
 from ._logger import _LOGGER
```
```diff
@@ -18,7 +18,6 @@ def custom_logger(
     data: Union[
         List[Any],
         Dict[Any, Any],
-        pd.DataFrame,
         str,
         BaseException
     ],
```
```diff
@@ -75,7 +74,7 @@ def custom_logger(
         _log_exception_to_log(data, base_path.with_suffix(".log"))

     else:
-        raise ValueError("Unsupported data type. Must be list, dict,
+        raise ValueError("Unsupported data type. Must be list, dict, str, or BaseException.")

     _LOGGER.info(f"🗄️ Log saved to: '{base_path}'")

```
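The updated error message reflects the same change as the import hunk above: pandas DataFrames are no longer an accepted input type. Callers that previously logged a DataFrame would need to convert it first; a hedged sketch follows (custom_logger's full signature is not shown in this diff).

```python
import pandas as pd

df = pd.DataFrame({"epoch": [1, 2], "loss": [0.9, 0.5]})

# Dict[Any, Any] is still supported; orient="list" produces one list per
# column, which is the shape the new _log_dict_to_csv expects.
log_data = df.to_dict(orient="list")
# custom_logger(log_data, ...)  # remaining arguments not shown in this diff
```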
```diff
@@ -106,8 +105,19 @@ def _log_dict_to_csv(data: Dict[Any, List[Any]], path: Path) -> None:
         padded_value = value + [None] * (max_length - len(value))
         sanitized_dict[sanitized_key] = padded_value

-
-
+    # The `newline=''` argument is important to prevent extra blank rows
+    with open(path, 'w', newline='', encoding='utf-8') as csv_file:
+        writer = csv.writer(csv_file)
+
+        # 1. Write the header row from the sanitized dictionary keys
+        header = list(sanitized_dict.keys())
+        writer.writerow(header)
+
+        # 2. Transpose columns to rows and write them
+        # zip(*sanitized_dict.values()) elegantly converts the column data
+        # (lists in the dict) into row-by-row tuples.
+        rows_to_write = zip(*sanitized_dict.values())
+        writer.writerows(rows_to_write)


 def _log_string_to_log(data: str, path: Path) -> None:
```
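The rewritten _log_dict_to_csv drops the pandas dependency in favor of the standard library. The core trick is zip(*values), which transposes column lists into rows; here is a self-contained demonstration with invented data.

```python
import csv

columns = {"epoch": [1, 2, 3], "loss": [0.9, 0.5, 0.3]}

with open("log.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(columns.keys())           # header: epoch,loss
    writer.writerows(zip(*columns.values()))  # rows: (1, 0.9), (2, 0.5), (3, 0.3)
```

Note that zip stops at the shortest input, which is why the function pads every column to max_length with None before writing.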
ml_tools/data_exploration.py

```diff
@@ -15,9 +15,9 @@ __all__ = [
     "summarize_dataframe",
     "drop_constant_columns",
     "drop_rows_with_missing_data",
-    "split_features_targets",
     "show_null_columns",
     "drop_columns_with_missing_data",
+    "split_features_targets",
     "split_continuous_binary",
     "plot_correlation_heatmap",
     "plot_value_distributions",
```
```diff
@@ -125,17 +125,19 @@ def drop_rows_with_missing_data(df: pd.DataFrame, targets: Optional[list[str]],

     # Stage 1: Drop rows with all target columns missing
     if targets is not None:
-
+        # validate targets
+        valid_targets = _validate_columns(df_clean, targets)
+        target_na = df_clean[valid_targets].isnull().all(axis=1)
         if target_na.any():
             print(f"🧹 Dropping {target_na.sum()} rows with all target columns missing.")
             df_clean = df_clean[~target_na]
         else:
             print("✅ No rows with all targets missing.")
     else:
-
+        valid_targets = []

     # Stage 2: Drop rows based on feature column missing values
-    feature_cols = [col for col in df_clean.columns if col not in
+    feature_cols = [col for col in df_clean.columns if col not in valid_targets]
     if feature_cols:
         feature_na_frac = df_clean[feature_cols].isnull().mean(axis=1)
         rows_to_drop = feature_na_frac[feature_na_frac > threshold].index
```
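Stage 2 keeps its row-dropping rule: a row is removed when the fraction of missing feature values exceeds threshold. A toy illustration of that rule with invented data:

```python
import pandas as pd

df = pd.DataFrame({"a": [1, None, None], "b": [None, None, 3]})
threshold = 0.7

feature_na_frac = df.isnull().mean(axis=1)  # 0.5, 1.0, 0.5 per row
rows_to_drop = feature_na_frac[feature_na_frac > threshold].index
print(df.drop(index=rows_to_drop))  # only the all-missing middle row is dropped
```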
```diff
@@ -150,30 +152,6 @@ def drop_rows_with_missing_data(df: pd.DataFrame, targets: Optional[list[str]],
     return df_clean


-def split_features_targets(df: pd.DataFrame, targets: list[str]):
-    """
-    Splits a DataFrame's columns into features and targets.
-
-    Args:
-        df (pd.DataFrame): Pandas DataFrame containing the dataset.
-        targets (list[str]): List of column names to be treated as target variables.
-
-    Returns:
-        tuple: A tuple containing:
-            - pd.DataFrame: Features dataframe.
-            - pd.DataFrame: Targets dataframe.
-
-    Prints:
-        - Shape of the original dataframe.
-        - Shape of the features dataframe.
-        - Shape of the targets dataframe.
-    """
-    df_targets = df[targets]
-    df_features = df.drop(columns=targets)
-    print(f"Original shape: {df.shape}\nFeatures shape: {df_features.shape}\nTargets shape: {df_targets.shape}")
-    return df_features, df_targets
-
-
 def show_null_columns(df: pd.DataFrame, round_digits: int = 2):
     """
     Displays a table of columns with missing values, showing both the count and
```
```diff
@@ -202,7 +180,7 @@ def show_null_columns(df: pd.DataFrame, round_digits: int = 2):
     return null_summary


-def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7, show_nulls_after: bool = True) -> pd.DataFrame:
+def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7, show_nulls_after: bool = True, skip_columns: Optional[List[str]]=None) -> pd.DataFrame:
     """
     Drops columns with more than `threshold` fraction of missing values.

```
```diff
@@ -210,11 +188,22 @@ def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7, sho
         df (pd.DataFrame): The input DataFrame.
         threshold (float): Fraction of missing values above which columns are dropped.
         show_nulls_after (bool): Prints `show_null_columns` after dropping columns.
+        skip_columns (list[str] | None): If given, these columns wont be included in the drop process.

     Returns:
         pd.DataFrame: A new DataFrame without the dropped columns.
     """
-
+    # If skip_columns is provided, create a list of columns to check.
+    # Otherwise, check all columns.
+    cols_to_check = df.columns
+    if skip_columns:
+        # Use set difference for efficient exclusion
+        cols_to_check = df.columns.difference(skip_columns)
+
+    # Calculate the missing fraction only on the columns to be checked
+    missing_fraction = df[cols_to_check].isnull().mean()
+
+
     cols_to_drop = missing_fraction[missing_fraction > threshold].index

     if len(cols_to_drop) > 0:
```
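The new skip_columns parameter shields selected columns from the threshold check via pandas' Index.difference. A hypothetical call, with column names and data invented (the import path is assumed from the package layout):

```python
import numpy as np
import pandas as pd
from ml_tools.data_exploration import drop_columns_with_missing_data  # assumed import path

df = pd.DataFrame({
    "sparse":   [np.nan, np.nan, np.nan, 1.0],
    "target":   [np.nan, np.nan, np.nan, 2.0],
    "complete": [1.0, 2.0, 3.0, 4.0],
})

# Both sparse columns exceed the 0.7 threshold (75% missing), but
# protecting "target" keeps it in the returned DataFrame.
cleaned = drop_columns_with_missing_data(df, threshold=0.7, skip_columns=["target"])
```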
```diff
@@ -231,6 +220,31 @@ def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7, sho
     return df


+def split_features_targets(df: pd.DataFrame, targets: list[str]):
+    """
+    Splits a DataFrame's columns into features and targets.
+
+    Args:
+        df (pd.DataFrame): Pandas DataFrame containing the dataset.
+        targets (list[str]): List of column names to be treated as target variables.
+
+    Returns:
+        tuple: A tuple containing:
+            - pd.DataFrame: Features dataframe.
+            - pd.DataFrame: Targets dataframe.
+
+    Prints:
+        - Shape of the original dataframe.
+        - Shape of the features dataframe.
+        - Shape of the targets dataframe.
+    """
+    valid_targets = _validate_columns(df, targets)
+    df_targets = df[valid_targets]
+    df_features = df.drop(columns=valid_targets)
+    print(f"Original shape: {df.shape}\nFeatures shape: {df_features.shape}\nTargets shape: {df_targets.shape}")
+    return df_features, df_targets
+
+
 def split_continuous_binary(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
     """
     Split DataFrame into two DataFrames: one with continuous columns, one with binary columns.
```
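split_features_targets is re-added further down the module, now routed through the new _validate_columns helper. One behavioral consequence, shown with invented data: unknown target names are silently filtered out instead of raising a KeyError as the 4.2.2 version would.

```python
import pandas as pd
from ml_tools.data_exploration import split_features_targets  # assumed import path

df = pd.DataFrame({"f1": [1, 2], "f2": [3, 4], "y": [0, 1]})
X, y = split_features_targets(df, targets=["y", "not_a_column"])
# Features shape: (2, 2); Targets shape: (2, 1) ("not_a_column" is ignored)
```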
```diff
@@ -631,5 +645,10 @@ def standardize_percentages(
     return df_copy


+def _validate_columns(df: pd.DataFrame, columns: list[str]):
+    valid_columns = [column for column in columns if column in df.columns]
+    return valid_columns
+
+
 def info():
     _script_info(__all__)
```