dragon-ml-toolbox 2.0.0__py3-none-any.whl → 2.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dragon-ml-toolbox was flagged as possibly problematic by the registry.

--- dragon_ml_toolbox-2.0.0.dist-info/METADATA
+++ dragon_ml_toolbox-2.2.0.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 2.0.0
+Version: 2.2.0
 Summary: A collection of tools for data science and machine learning projects
 Author-email: Karl Loza <luigiloza@gmail.com>
 License-Expression: MIT
--- /dev/null
+++ dragon_ml_toolbox-2.2.0.dist-info/RECORD
@@ -0,0 +1,21 @@
+dragon_ml_toolbox-2.2.0.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
+dragon_ml_toolbox-2.2.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=6cfpIeQ6D4Mcs10nkogQrkVyq1T7i2qXjjNHFoUMOyE,1892
+ml_tools/ETL_engineering.py,sha256=9Lg-anXhggtdzvRPgVVSiAUGu5sb-LAZDfLDFXJlHns,21328
+ml_tools/MICE_imputation.py,sha256=1fovHycZMdZ6OgVh_bk8-r3wGi4rqf6rS10LOEWYaQo,11177
+ml_tools/PSO_optimization.py,sha256=T-wnB94DcRWuRd2M3loDVT4POtIP0MOhs-VilAf1L4E,20974
+ml_tools/VIF_factor.py,sha256=lpM3Z2X_iZfXUWbCbURoeI0Tb196lU0bAsRo7q6AzBM,10235
+ml_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ml_tools/_particle_swarm_optimization.py,sha256=b_eNNkA89Y40hj76KauivT8KLScH1B9wF2IXptOqkOw,22220
+ml_tools/data_exploration.py,sha256=CDUVRTHfww105IXDRpBQ81KZWx5HXSsA-FVsVYBzNw8,21298
+ml_tools/datasetmaster.py,sha256=EFUEX-tqq94Ak1rXXYR-XaX85olrxvF2EuytdzUK7y0,29131
+ml_tools/ensemble_learning.py,sha256=q9jbu7SupvXz61sURFQ9V2-7gUsLbA3cSgyb2MQFyyc,37351
+ml_tools/handle_excel.py,sha256=Uasx-DX7RNVQSzGHVJhX7UQ9RgBbX5H1ud1Hw_y8Kp4,12944
+ml_tools/logger.py,sha256=_k7WJdpFJj3IsjOgvjLJgUFZyF8RK3Jlgp5tAu_dLQU,4767
+ml_tools/pytorch_models.py,sha256=bpWZsrSwCvHJQkR6UfoPpElsMv9AvmiNErNHC8NYB_I,10132
+ml_tools/trainer.py,sha256=WAZ4EdrZuTOAnGXRWV3XcLNce4s7EKGf2-qchLC08Ik,15702
+ml_tools/utilities.py,sha256=A7Wm1ArpqFG80WKmnkYdtSzIRLvg5x-9nPNidZIbpPA,20671
+ml_tools/vision_helpers.py,sha256=idQ-Ugp1IdsvwXiYyhYa9G3rTRTm37YRpkQDLEpANHM,7701
+dragon_ml_toolbox-2.2.0.dist-info/METADATA,sha256=oTLE1Q6BzsIwicQM7XCumt89XAjHZcV6CxDTfyteP_w,2974
+dragon_ml_toolbox-2.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+dragon_ml_toolbox-2.2.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
+dragon_ml_toolbox-2.2.0.dist-info/RECORD,,
--- /dev/null
+++ ml_tools/ETL_engineering.py
@@ -0,0 +1,543 @@
+import polars as pl
+import re
+from typing import Literal, Union, Optional, Any, Callable, List, Dict
+from .utilities import _script_info
+
+
+__all__ = [
+    "TransformationRecipe",
+    "DataProcessor",
+    "KeywordDummifier",
+    "NumberExtractor",
+    "MultiNumberExtractor",
+    "CategoryMapper",
+    "ValueBinner",
+    "DateFeatureExtractor"
+]
+
+# Magic word for rename-only transformation
+_RENAME = "rename"
+
+class TransformationRecipe:
+    """
+    A builder class for creating a data transformation recipe.
+
+    This class provides a structured way to define a series of transformation
+    steps, with validation performed at the time of addition. It is designed
+    to be passed to a `DataProcessor`.
+
+    Use the method `add()` to add transformation steps.
+    """
+    def __init__(self):
+        self._steps: List[Dict[str, Any]] = []
+
+    def add(
+        self,
+        input_col_name: str,
+        output_col_names: Union[str, List[str]],
+        transform: Union[str, Callable],
+    ) -> "TransformationRecipe":
+        """
+        Adds a new transformation step to the recipe.
+
+        Args:
+            input_col_name: The name of the column from the source DataFrame.
+            output_col_names: The desired name(s) for the output column(s).
+                A string for a 1-to-1 mapping, or a list of strings
+                for a 1-to-many mapping.
+            transform: The transformation to apply:
+                - Use "rename" for simple column renaming.
+                - If callable, must accept a `pl.Series` as the only parameter and return either a `pl.Series` or a `pl.DataFrame`.
+
+        Returns:
+            The instance of the recipe itself to allow for method chaining.
+        """
+        # --- Validation ---
+        if not isinstance(input_col_name, str) or not input_col_name:
+            raise TypeError("'input_col_name' must be a non-empty string.")
+
+        if transform == _RENAME:
+            if not isinstance(output_col_names, str):
+                raise TypeError("For a RENAME operation, 'output_col_names' must be a string.")
+        elif not isinstance(transform, Callable):
+            raise TypeError(f"'transform' must be a callable function or the string '{_RENAME}'.")
+
+        if isinstance(output_col_names, list) and transform == _RENAME:
+            raise ValueError("A RENAME operation cannot have a list of output columns.")
+
+        # --- Add Step ---
+        step = {
+            "input_col": input_col_name,
+            "output_col": output_col_names,
+            "transform": transform,
+        }
+        self._steps.append(step)
+        return self  # Allow chaining: recipe.add(...).add(...)
+
+    def __iter__(self):
+        """Allows the class to be iterated over, like a list."""
+        return iter(self._steps)
+
+    def __len__(self):
+        """Allows the len() function to be used on an instance."""
+        return len(self._steps)
+
+
+class DataProcessor:
+    """
+    Transforms a Polars DataFrame based on a provided `TransformationRecipe` object.
+
+    Use the method `transform()`.
+    """
+    def __init__(self, recipe: TransformationRecipe):
+        """
+        Initializes the DataProcessor with a transformation recipe.
+
+        Args:
+            recipe: An instance of the `TransformationRecipe` class that has
+                been populated with transformation steps.
+        """
+        if not isinstance(recipe, TransformationRecipe):
+            raise TypeError("The recipe must be an instance of TransformationRecipe.")
+        if len(recipe) == 0:
+            raise ValueError("The recipe cannot be empty.")
+        self.recipe = recipe
+
+    def transform(self, df: pl.DataFrame) -> pl.DataFrame:
+        """
+        Applies the transformation recipe to the input DataFrame.
+        """
+        processed_columns = []
+        # Recipe object is iterable
+        for step in self.recipe:
+            input_col_name = step["input_col"]
+            output_col_spec = step["output_col"]
+            transform_action = step["transform"]
+
+            if input_col_name not in df.columns:
+                raise ValueError(f"Input column '{input_col_name}' not found in DataFrame.")
+
+            input_series = df.get_column(input_col_name)
+
+            if transform_action == _RENAME:
+                processed_columns.append(input_series.alias(output_col_spec))
+                continue
+
+            if isinstance(transform_action, Callable):
+                result = transform_action(input_series)
+
+                if isinstance(result, pl.Series):
+                    if not isinstance(output_col_spec, str):
+                        raise TypeError(f"Function for '{input_col_name}' returned a Series but 'output_col' is not a string.")
+                    processed_columns.append(result.alias(output_col_spec))
+
+                elif isinstance(result, pl.DataFrame):
+                    if not isinstance(output_col_spec, list):
+                        raise TypeError(f"Function for '{input_col_name}' returned a DataFrame but 'output_col' is not a list.")
+                    if len(result.columns) != len(output_col_spec):
+                        raise ValueError(
+                            f"Mismatch in '{input_col_name}': function produced {len(result.columns)} columns, "
+                            f"but recipe specifies {len(output_col_spec)} output names."
+                        )
+
+                    renamed_df = result.rename(dict(zip(result.columns, output_col_spec)))
+                    processed_columns.extend(renamed_df.get_columns())
+
+                else:
+                    raise TypeError(f"Function for '{input_col_name}' returned an unexpected type: {type(result)}.")
+
+            else:  # This case is now unlikely due to builder validation.
+                raise TypeError(f"Invalid 'transform' action for '{input_col_name}': {transform_action}")
+
+        if not processed_columns:
+            print("Warning: The transformation resulted in an empty DataFrame.")
+            return pl.DataFrame()
+
+        return pl.DataFrame(processed_columns)
+
+
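Editor's sketch of the recipe and processor used together (not part of the wheel; the sample DataFrame, column names, and lambda are hypothetical):

    import polars as pl
    from ml_tools.ETL_engineering import TransformationRecipe, DataProcessor

    df = pl.DataFrame({"age": [30, 45], "weight": ["70 kg", "82 kg"]})

    recipe = (
        TransformationRecipe()
        .add("age", "age_years", "rename")
        .add("weight", "weight_kg", lambda s: s.str.extract(r"(\d+)", 1).cast(pl.Int64))
    )
    processed = DataProcessor(recipe).transform(df)  # -> columns: age_years, weight_kg
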
+class KeywordDummifier:
+    """
+    A configurable transformer that creates one-hot encoded columns based on
+    keyword matching in a Polars Series.
+
+    Instantiate this class with keyword configurations. The instance can be used as a 'transform' callable compatible with the `TransformationRecipe`.
+
+    Args:
+        group_names (List[str]):
+            A list of strings, where each string is the name of a category.
+            This defines the matching priority and the base column names of the
+            DataFrame returned by the transformation.
+        group_keywords (List[List[str]]):
+            A list of lists of strings. Each inner list corresponds to a
+            `group_name` at the same index and contains the keywords to search for.
+    """
+    def __init__(self, group_names: List[str], group_keywords: List[List[str]]):
+        if len(group_names) != len(group_keywords):
+            raise ValueError("Initialization failed: 'group_names' and 'group_keywords' must have the same length.")
+
+        self.group_names = group_names
+        self.group_keywords = group_keywords
+
+    def __call__(self, column: pl.Series) -> pl.DataFrame:
+        """
+        Executes the one-hot encoding logic.
+
+        Args:
+            column (pl.Series): The input Polars Series to transform.
+
+        Returns:
+            pl.DataFrame: A DataFrame with one-hot encoded columns.
+        """
+        column = column.cast(pl.Utf8)
+
+        categorize_expr = pl.when(pl.lit(False)).then(pl.lit(None))
+        for name, keywords in zip(self.group_names, self.group_keywords):
+            pattern = "|".join(re.escape(k) for k in keywords)
+            categorize_expr = categorize_expr.when(
+                column.str.contains(pattern)
+            ).then(pl.lit(name))
+
+        categorize_expr = categorize_expr.otherwise(None).alias("category")
+
+        # pl.select() materializes the expression; the DataFrame constructor
+        # does not accept raw expressions.
+        temp_df = pl.select(categorize_expr)
+        df_with_dummies = temp_df.to_dummies(columns=["category"])
+
+        final_columns = []
+        for name in self.group_names:
+            dummy_col_name = f"category_{name}"
+            if dummy_col_name in df_with_dummies.columns:
+                # The alias here uses the group name as the final column name
+                final_columns.append(
+                    df_with_dummies.get_column(dummy_col_name).alias(name)
+                )
+            else:
+                # No matches for this group: emit a full-length column of zeros.
+                final_columns.append(pl.Series(name, [0] * len(column), dtype=pl.UInt8))
+
+        return pl.DataFrame(final_columns)
+
+
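Editor's sketch of the dummifier on its own (sample strings invented for illustration):

    import polars as pl
    from ml_tools.ETL_engineering import KeywordDummifier

    dummifier = KeywordDummifier(
        group_names=["steel", "aluminum"],
        group_keywords=[["steel", "iron"], ["aluminum", "alu"]],
    )
    flags = dummifier(pl.Series(["stainless steel", "alu sheet", "copper"]))
    # -> columns "steel" and "aluminum" with 0/1 flags; "copper" matches neither
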
+class NumberExtractor:
+    """
+    A configurable transformer that extracts a single number from a Polars string series using a regular expression.
+
+    An instance can be used as a 'transform' callable within the
+    `DataProcessor` pipeline.
+
+    Args:
+        regex_pattern (str):
+            The regular expression used to find the number. This pattern
+            MUST contain exactly one capturing group `(...)`. Defaults to a standard pattern for integers and floats.
+        dtype (str):
+            The desired data type for the output column. Defaults to "float".
+        round_digits (int | None):
+            If the dtype is 'float', you can specify the number of decimal
+            places to round the result to. This parameter is ignored if
+            dtype is 'int'. Defaults to None (no rounding).
+    """
+    def __init__(
+        self,
+        regex_pattern: str = r"(\d+\.?\d*)",
+        dtype: Literal["float", "int"] = "float",
+        round_digits: Optional[int] = None,
+    ):
+        # --- Validation ---
+        if not isinstance(regex_pattern, str):
+            raise TypeError("regex_pattern must be a string.")
+
+        # Validate that the regex has exactly one capturing group
+        try:
+            if re.compile(regex_pattern).groups != 1:
+                raise ValueError("regex_pattern must contain exactly one capturing group '(...)'")
+        except re.error as e:
+            raise ValueError(f"Invalid regex pattern provided: {e}") from e
+
+        if dtype not in ["float", "int"]:
+            raise ValueError("dtype must be either 'float' or 'int'.")
+
+        if round_digits is not None:
+            if not isinstance(round_digits, int):
+                raise TypeError("round_digits must be an integer.")
+            if dtype == "int":
+                print("Warning: 'round_digits' is specified but dtype is 'int'. Rounding will be ignored.")
+
+        self.regex_pattern = regex_pattern
+        self.dtype = dtype
+        self.round_digits = round_digits
+        self.polars_dtype = pl.Float64 if dtype == "float" else pl.Int64
+
+    def __call__(self, column: pl.Series) -> pl.Series:
+        """
+        Executes the number extraction logic.
+
+        Args:
+            column (pl.Series): The input Polars Series to transform.
+
+        Returns:
+            pl.Series: A new Series containing the extracted numbers.
+        """
+        # Extract the first (and only) capturing group
+        extracted = column.str.extract(self.regex_pattern, 1)
+
+        # Cast to the desired numeric type. Non-matching strings become null.
+        casted = extracted.cast(self.polars_dtype, strict=False)
+
+        # Apply rounding only if it's a float and round_digits is set
+        if self.dtype == "float" and self.round_digits is not None:
+            return casted.round(self.round_digits)
+
+        return casted
+
+
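Editor's sketch (sample values hypothetical):

    import polars as pl
    from ml_tools.ETL_engineering import NumberExtractor

    extractor = NumberExtractor(dtype="float", round_digits=1)
    extractor(pl.Series(["12.53 mm", "7 mm", "n/a"]))  # -> [12.5, 7.0, null]
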
+class MultiNumberExtractor:
+    """
+    Extracts multiple numbers from a single Polars string column into several new columns.
+
+    This transformer is designed for one-to-many mappings, such as parsing
+    ratios (100:30) or coordinates (10, 25) into separate columns.
+
+    Args:
+        num_outputs (int):
+            Number of numeric columns to create.
+        regex_pattern (str):
+            The regex pattern to find all numbers. Must contain one
+            capturing group around the number part.
+            Defaults to a standard pattern for integers and floats.
+        dtype (str):
+            The desired data type for the output columns. Defaults to "float".
+        fill_value (int | float | None):
+            A value to fill in when fewer numbers are found than `num_outputs`.
+            For example, if `num_outputs=2` and only one number is found in a
+            string, the second output column is filled with this value. If
+            None, it is filled with null. Null inputs always stay null.
+    """
+    def __init__(
+        self,
+        num_outputs: int,
+        regex_pattern: str = r"(\d+\.?\d*)",
+        dtype: Literal["float", "int"] = "float",
+        fill_value: Optional[Union[int, float]] = None
+    ):
+        # --- Validation ---
+        if not isinstance(num_outputs, int) or num_outputs <= 0:
+            raise ValueError("num_outputs must be a positive integer.")
+
+        if not isinstance(regex_pattern, str):
+            raise TypeError("regex_pattern must be a string.")
+
+        # Validate that the regex has exactly one capturing group
+        try:
+            if re.compile(regex_pattern).groups != 1:
+                raise ValueError("regex_pattern must contain exactly one capturing group '(...)'")
+        except re.error as e:
+            raise ValueError(f"Invalid regex pattern provided: {e}") from e
+
+        # Validate dtype
+        if dtype not in ["float", "int"]:
+            raise ValueError("dtype must be either 'float' or 'int'.")
+
+        self.num_outputs = num_outputs
+        self.regex_pattern = regex_pattern
+        self.fill_value = fill_value
+        self.polars_dtype = pl.Float64 if dtype == "float" else pl.Int64
+
+    def __call__(self, column: pl.Series) -> pl.DataFrame:
+        """
+        Executes the multi-number extraction logic. Preserves nulls from the input column.
+        """
+        output_expressions = []
+        for i in range(self.num_outputs):
+            # Define the core extraction logic for the i-th number.
+            # null_on_oob=True yields null instead of raising when fewer
+            # than `num_outputs` matches are found (polars >= 0.20 behavior).
+            extraction_expr = (
+                column.str.extract_all(self.regex_pattern)
+                .list.get(i, null_on_oob=True)
+                .cast(self.polars_dtype, strict=False)
+            )
+
+            # Apply the fill value if provided
+            if self.fill_value is not None:
+                extraction_expr = extraction_expr.fill_null(self.fill_value)
+
+            # Only apply the logic when the input is not null.
+            # Otherwise, the result should also be null.
+            final_expr = (
+                pl.when(column.is_not_null())
+                .then(extraction_expr)
+                .otherwise(None)
+                .alias(f"col_{i}")  # Name the final output expression
+            )
+
+            output_expressions.append(final_expr)
+
+        return pl.select(output_expressions)
+
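Editor's sketch (sample ratio strings hypothetical):

    import polars as pl
    from ml_tools.ETL_engineering import MultiNumberExtractor

    extractor = MultiNumberExtractor(num_outputs=2, fill_value=0)
    extractor(pl.Series(["100:30", "75", None]))
    # -> col_0: [100.0, 75.0, null]; col_1: [30.0, 0.0, null]
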
+
+class CategoryMapper:
+    """
+    A transformer that maps string categories to specified numerical values using a dictionary.
+
+    Ideal for ordinal encoding.
+
+    Args:
+        mapping (Dict[str, int | float]):
+            A dictionary that defines the mapping from a string category (key)
+            to a numerical value (value).
+        unseen_value (int | float | None):
+            The numerical value to use for categories that are present in the
+            data but not in the mapping dictionary. If not provided or set
+            to None, unseen categories will be mapped to a null value.
+    """
+    def __init__(
+        self,
+        mapping: Dict[str, Union[int, float]],
+        unseen_value: Optional[Union[int, float]] = None,
+    ):
+        if not isinstance(mapping, dict):
+            raise TypeError("The 'mapping' argument must be a dictionary.")
+
+        self.mapping = mapping
+        self.default_value = unseen_value
+
+    def __call__(self, column: pl.Series) -> pl.Series:
+        """
+        Applies the dictionary mapping to the input column.
+
+        Args:
+            column (pl.Series): The input Polars Series of categories.
+
+        Returns:
+            pl.Series: A new Series with categories mapped to numbers.
+        """
+        # Ensure the column is treated as a string for matching keys.
+        # replace_strict() supersedes map_dict(), which was removed in
+        # Polars 1.0; unmapped categories fall back to `default`.
+        return column.cast(pl.Utf8).replace_strict(self.mapping, default=self.default_value)
+
+
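Editor's sketch (category labels hypothetical):

    import polars as pl
    from ml_tools.ETL_engineering import CategoryMapper

    mapper = CategoryMapper({"low": 0, "medium": 1, "high": 2}, unseen_value=-1)
    mapper(pl.Series(["low", "high", "unknown"]))  # -> [0, 2, -1]
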
+class ValueBinner:
+    """
+    A transformer that discretizes a continuous numerical column into a finite number of bins.
+
+    Each bin is assigned an integer label (0, 1, 2, ...).
+
+    Args:
+        breaks (List[int | float]):
+            A list of numbers defining the boundaries of the bins. The list
+            must be sorted in ascending order and contain at least two values.
+            For example, `breaks=[0, 18, 40, 65]` creates three bins.
+        left_closed (bool):
+            Determines which side of the interval is inclusive.
+            - If `False` (default): Intervals are (lower, upper].
+            - If `True`: Intervals are [lower, upper).
+    """
+    def __init__(
+        self,
+        breaks: List[Union[int, float]],
+        left_closed: bool = False,
+    ):
+        # --- Validation ---
+        if not isinstance(breaks, list) or len(breaks) < 2:
+            raise ValueError("The 'breaks' argument must be a list of at least two numbers.")
+
+        # Check if the list is sorted
+        if not all(breaks[i] <= breaks[i+1] for i in range(len(breaks)-1)):
+            raise ValueError("The 'breaks' list must be sorted in ascending order.")
+
+        self.breaks = breaks
+        self.left_closed = left_closed
+        # Generate numerical labels ["0", "1", ...] for the bins between breaks
+        self.labels = [str(i) for i in range(len(breaks) - 1)]
+
+    def __call__(self, column: pl.Series) -> pl.Series:
+        """
+        Applies the binning logic to the input column.
+
+        Args:
+            column (pl.Series): The input Polars Series of numerical data.
+
+        Returns:
+            pl.Series: A new Series of integer labels for the bins. Values
+            outside the specified breaks will become null.
+        """
+        # `cut` always creates len(breaks) + 1 categories, including the two
+        # unbounded outer intervals, so pad the labels with sentinels.
+        binned_column = column.cut(
+            breaks=self.breaks,
+            labels=["_below", *self.labels, "_above"],
+            left_closed=self.left_closed
+        )
+
+        # Cast the string labels ("0", "1", ...) straight to integers; the
+        # sentinel labels fail the cast and become null, which maps values
+        # outside the outer breaks to null.
+        return binned_column.cast(pl.Utf8).cast(pl.Int32, strict=False)
+
+
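Editor's sketch (age values hypothetical; 70 falls outside the last break and becomes null):

    import polars as pl
    from ml_tools.ETL_engineering import ValueBinner

    binner = ValueBinner(breaks=[0, 18, 40, 65])
    binner(pl.Series([10, 30, 70]))  # -> [0, 1, null]
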
+class DateFeatureExtractor:
+    """
+    A one-to-many transformer that extracts multiple numerical features from a date or datetime column.
+
+    It can handle columns that are already in a Polars Date/Datetime format,
+    or it can parse string columns if a format is provided.
+
+    Args:
+        features (List[str]):
+            A list of the date/time features to extract. Supported features are:
+            'year', 'month', 'day', 'hour', 'minute', 'second', 'millisecond',
+            'microsecond', 'nanosecond', 'ordinal_day' (day of year),
+            'weekday' (Mon=1, Sun=7), 'week' (week of year), and 'timestamp'.
+        format (str | None):
+            The format code used to parse string dates (e.g., "%Y-%m-%d %H:%M:%S").
+            Use if the input column is not a Date or Datetime type.
+    """
+
+    ALLOWED_FEATURES = {
+        'year', 'month', 'day', 'hour', 'minute', 'second', 'millisecond',
+        'microsecond', 'nanosecond', 'ordinal_day', 'weekday', 'week', 'timestamp'
+    }
+
+    def __init__(
+        self,
+        features: List[str],
+        format: Optional[str] = None,
+    ):
+        # --- Validation ---
+        if not isinstance(features, list) or not features:
+            raise ValueError("'features' must be a non-empty list of strings.")
+
+        for feature in features:
+            if feature not in self.ALLOWED_FEATURES:
+                raise ValueError(
+                    f"Feature '{feature}' is not supported. "
+                    f"Allowed features are: {self.ALLOWED_FEATURES}"
+                )
+
+        self.features = features
+        self.format = format
+
+    def __call__(self, column: pl.Series) -> pl.DataFrame:
+        """
+        Applies the feature extraction logic to the input column.
+
+        Args:
+            column (pl.Series): The input Polars Series of dates.
+
+        Returns:
+            pl.DataFrame: A DataFrame with columns for each extracted feature.
+        """
+        date_col = column
+        # First, parse strings into a datetime object if a format is given
+        if self.format is not None:
+            date_col = date_col.str.to_datetime(format=self.format, strict=False)
+
+        output_expressions = []
+        for i, feature in enumerate(self.features):
+            # Build the expression based on the feature name
+            if feature == 'timestamp':
+                expr = date_col.dt.timestamp(time_unit="ms")
+            else:
+                # getattr is a clean way to call methods like .dt.year(), .dt.month(), etc.
+                expr = getattr(date_col.dt, feature)()
+
+            # Alias with a generic name for the processor to handle
+            output_expressions.append(expr.alias(f"col_{i}"))
+
+        return pl.select(output_expressions)
+
+
+def info():
+    _script_info(__all__)
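Editor's sketch (date strings hypothetical):

    import polars as pl
    from ml_tools.ETL_engineering import DateFeatureExtractor

    extractor = DateFeatureExtractor(features=["year", "month", "weekday"], format="%Y-%m-%d")
    extractor(pl.Series(["2024-03-15", "2023-12-01"]))
    # -> col_0 (year), col_1 (month), col_2 (weekday)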