PyPI - dragon-ml-toolbox - Versions diffs - 10.2.0__py3-none-any.whl → 14.2.0__py3-none-any.whl - Mend

dragon-ml-toolbox 10.2.0-py3-none-any.whl → 14.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of dragon-ml-toolbox might be problematic. Click here for more details.

Files changed (48)

{dragon_ml_toolbox-10.2.0.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/METADATA +38 -63
dragon_ml_toolbox-14.2.0.dist-info/RECORD +48 -0
{dragon_ml_toolbox-10.2.0.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/licenses/LICENSE +1 -1
{dragon_ml_toolbox-10.2.0.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +11 -0
ml_tools/ETL_cleaning.py +72 -34
ml_tools/ETL_engineering.py +506 -70
ml_tools/GUI_tools.py +2 -1
ml_tools/MICE_imputation.py +212 -7
ml_tools/ML_callbacks.py +73 -40
ml_tools/ML_datasetmaster.py +267 -284
ml_tools/ML_evaluation.py +119 -58
ml_tools/ML_evaluation_multi.py +107 -32
ml_tools/ML_inference.py +15 -5
ml_tools/ML_models.py +234 -170
ml_tools/ML_models_advanced.py +323 -0
ml_tools/ML_optimization.py +321 -97
ml_tools/ML_scaler.py +10 -5
ml_tools/ML_trainer.py +585 -40
ml_tools/ML_utilities.py +528 -0
ml_tools/ML_vision_datasetmaster.py +1315 -0
ml_tools/ML_vision_evaluation.py +260 -0
ml_tools/ML_vision_inference.py +428 -0
ml_tools/ML_vision_models.py +627 -0
ml_tools/ML_vision_transformers.py +58 -0
ml_tools/PSO_optimization.py +10 -7
ml_tools/RNN_forecast.py +2 -0
ml_tools/SQL.py +22 -9
ml_tools/VIF_factor.py +4 -3
ml_tools/_ML_vision_recipe.py +88 -0
ml_tools/__init__.py +1 -0
ml_tools/_logger.py +0 -2
ml_tools/_schema.py +96 -0
ml_tools/constants.py +79 -0
ml_tools/custom_logger.py +164 -16
ml_tools/data_exploration.py +1092 -109
ml_tools/ensemble_evaluation.py +48 -1
ml_tools/ensemble_inference.py +6 -7
ml_tools/ensemble_learning.py +4 -3
ml_tools/handle_excel.py +1 -0
ml_tools/keys.py +80 -0
ml_tools/math_utilities.py +259 -0
ml_tools/optimization_tools.py +198 -24
ml_tools/path_manager.py +144 -45
ml_tools/serde.py +192 -0
ml_tools/utilities.py +287 -227
dragon_ml_toolbox-10.2.0.dist-info/RECORD +0 -36
{dragon_ml_toolbox-10.2.0.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/WHEEL +0 -0
{dragon_ml_toolbox-10.2.0.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/top_level.txt +0 -0

ml_tools/ETL_engineering.py CHANGED Viewed

@@ -1,8 +1,13 @@
 import polars as pl
 import re
+from pathlib import Path
 from typing import Literal, Union, Optional, Any, Callable, List, Dict, Tuple
+from .utilities import load_dataframe, save_dataframe_filename
+from .path_manager import make_fullpath
 from ._script_info import _script_info
 from ._logger import _LOGGER
+from .constants import CHEMICAL_ELEMENT_SYMBOLS
 __all__ = [
@@ -14,11 +19,15 @@ __all__ = [
     "KeywordDummifier",
     "NumberExtractor",
     "MultiNumberExtractor",
+    "TemperatureExtractor",
+    "MultiTemperatureExtractor",
     "RatioCalculator",
+    "TriRatioCalculator",
     "CategoryMapper",
     "RegexMapper",
     "ValueBinner",
-    "DateFeatureExtractor"
+    "DateFeatureExtractor",
+    "MolecularFormulaTransformer"
 ]
 ############ TRANSFORM MAIN ####################
@@ -42,17 +51,20 @@ class TransformationRecipe:
     def add(
         self,
         input_col_name: str,
-        output_col_names: Union[str, List[str]],
         transform: Union[str, Callable],
+        output_col_names: Optional[Union[str, List[str]]] = None
     ) -> "TransformationRecipe":
         """
         Adds a new transformation step to the recipe.
         Args:
-            input_col: The name of the column from the source DataFrame.
-            output_col: The desired name(s) for the output column(s).
-                        A string for a 1-to-1 mapping, or a list of strings
-                        for a 1-to-many mapping.
+            input_col_name: The name of the column from the source DataFrame.
+            output_col_names: The desired name(s) for the output column(s).
+                        - A string for a 1-to-1 mapping.
+                        - A list of strings for a 1-to-many mapping.
+                        - A string prefix for 1-to-many mapping.
+                        - If None, the input name is used for 1-to-1 transforms,
+                          or the transformer's default names are used for 1-to-many.
             transform: The transformation to apply:
                 - Use "rename" for simple column renaming
                 - If callable, must accept a `pl.Series` as the only parameter and return either a `pl.Series` or `pl.DataFrame`.
@@ -72,10 +84,6 @@ class TransformationRecipe:
         elif not isinstance(transform, Callable):
             _LOGGER.error(f"'transform' must be a callable function or the string '{_RENAME}'.")
             raise TypeError()
-        if isinstance(output_col_names, list) and transform == _RENAME:
-            _LOGGER.error("A RENAME operation cannot have a list of output columns.")
-            raise ValueError()
         # --- Add Step ---
         step = {
@@ -99,7 +107,7 @@ class DataProcessor:
     """
     Transforms a Polars DataFrame based on a provided `TransformationRecipe` object.
-    Use the method `transform()`.
+    Use the methods `transform()` or `load_transform_save()`.
     """
     def __init__(self, recipe: TransformationRecipe):
         """
@@ -142,33 +150,53 @@ class DataProcessor:
                 result = transform_action(input_series)
                 if isinstance(result, pl.Series):
-                    if not isinstance(output_col_spec, str):
-                        _LOGGER.error(f"Function for '{input_col_name}' returned a Series but 'output_col' is not a string.")
+                    # Default to input name if spec is None
+                    output_name = output_col_spec if output_col_spec is not None else input_col_name
+                    if not isinstance(output_name, str):
+                        _LOGGER.error(f"Function for '{input_col_name}' returned a Series but 'output_col' must be a string or None.")
                         raise TypeError()
-                    processed_columns.append(result.alias(output_col_spec))
+                    processed_columns.append(result.alias(output_name))
                 elif isinstance(result, pl.DataFrame):
-                    # 1. Handle list-based renaming
-                    if isinstance(output_col_spec, list):
+                    # 1. Handle None in output names
+                    if output_col_spec is None:
+                        # Use the column names generated by the transformer directly
+                        processed_columns.extend(result.get_columns())
+                    # 2. Handle list-based renaming
+                    elif isinstance(output_col_spec, list):
                         if len(result.columns) != len(output_col_spec):
                             _LOGGER.error(f"Mismatch in '{input_col_name}': function produced {len(result.columns)} columns, but recipe specifies {len(output_col_spec)} output names.")
                             raise ValueError()
                         renamed_df = result.rename(dict(zip(result.columns, output_col_spec)))
                         processed_columns.extend(renamed_df.get_columns())
-                    # 2. Handle a string prefix for AutoDummifier
+                    # 3. Global logic for adding a single prefix to all columns.
                     elif isinstance(output_col_spec, str):
                         prefix = output_col_spec
-                        # Replace the original name part with the desired prefix.
-                        new_names = {
-                            col: f"{prefix}{col[len(input_col_name):]}" for col in result.columns
-                        }
+                        new_names = {}
+                        for col in result.columns:
+                            # Case 1: Transformer's output column name contains the input name.
+                            # Action: Replace the input name with the desired prefix.
+                            # Example: input='color', output='color_red', prefix='spec' -> 'spec_red'
+                            if input_col_name in col:
+                                new_names[col] = col.replace(input_col_name, prefix, 1)
+                            # Case 2: Transformer's output is an independent name.
+                            # Action: Prepend the prefix to the output name.
+                            # Example: input='ratio', output='A_B', prefix='spec' -> 'spec_A_B'
+                            else:
+                                new_names[col] = f"{prefix}_{col}"
                         renamed_df = result.rename(new_names)
-                        processed_columns.extend(renamed_df.get_columns())
+                        processed_columns.extend(renamed_df.get_columns())
                     else:
-                        _LOGGER.error(f"Function for '{input_col_name}' returned a DataFrame, so 'output_col' must be a list of names or a string prefix.")
+                        _LOGGER.error(f"Function for '{input_col_name}' returned a DataFrame, so 'output_col' must be a list of names, a string prefix, or None.")
                         raise TypeError()
                 else:
@@ -182,9 +210,28 @@ class DataProcessor:
         if not processed_columns:
             _LOGGER.error("The transformation resulted in an empty DataFrame.")
             return pl.DataFrame()
+        _LOGGER.info(f"Processed dataframe with {len(processed_columns)} columns.")
         return pl.DataFrame(processed_columns)
+    def load_transform_save(self, input_path: Union[str,Path], output_path: Union[str,Path]):
+        """
+        Convenience wrapper for the transform method that includes automatic dataframe loading and saving.
+        """
+        # Validate paths
+        in_path = make_fullpath(input_path, enforce="file")
+        out_path = make_fullpath(output_path, make=True, enforce="file")
+        # load df
+        df, _ = load_dataframe(df_path=in_path, kind="polars", all_strings=True)
+        # Process
+        df_processed = self.transform(df)
+        # save processed df
+        save_dataframe_filename(df=df_processed, save_dir=out_path.parent, filename=out_path.name)
     def __str__(self) -> str:
         """
         Provides a detailed, human-readable string representation of the
@@ -253,7 +300,7 @@ class BinaryTransformer:
             _LOGGER.error("Provide either 'true_keywords' or 'false_keywords', but not both.")
             raise ValueError()
         if true_keywords is None and false_keywords is None:
-            _LOGGER.error("You must provide either 'true_keywords' or 'false_keywords'.")
+            _LOGGER.error("Provide either 'true_keywords' or 'false_keywords'.")
             raise ValueError()
         # --- Configuration ---
@@ -285,16 +332,17 @@ class BinaryTransformer:
         Returns:
             pl.Series: A new Series of type UInt8 containing 1s and 0s.
         """
+        column_base_name = column.name
         # Create a boolean Series: True if any keyword is found, else False
         contains_keyword = column.str.contains(self.pattern)
         # Apply logic and cast directly to integer type
         if self.mode == "true_mode":
             # True -> 1, False -> 0
-            return contains_keyword.cast(pl.UInt8)
+            return contains_keyword.cast(pl.UInt8).alias(column_base_name)
         else: # false_mode
             # We want the inverse: True -> 0, False -> 1
-            return (~contains_keyword).cast(pl.UInt8)
+            return (~contains_keyword).cast(pl.UInt8).alias(column_base_name)
 class AutoDummifier:
@@ -302,6 +350,15 @@ class AutoDummifier:
     A transformer that performs one-hot encoding on a categorical column,
     automatically detecting the unique categories from the data.
     """
+    def __init__(self, drop_first: bool = False):
+        """
+        Initializes the AutoDummifier.
+        Args:
+            drop_first (bool): If True, drops the first dummy column.
+        """
+        self.drop_first = drop_first
     def __call__(self, column: pl.Series) -> pl.DataFrame:
         """
         Executes the one-hot encoding logic.
@@ -314,8 +371,20 @@ class AutoDummifier:
                           Column names are auto-generated by Polars as
                           '{original_col_name}_{category_value}'.
         """
-        # Ensure the column is treated as a string before creating dummies
-        return column.cast(pl.Utf8).to_dummies()
+        # Store the original column name to construct the potential null column name
+        col_name = column.name
+        # Create the dummy variables from the series
+        dummies = column.cast(pl.Utf8).to_dummies(drop_first=self.drop_first)
+        # Define the name of the column that Polars creates for null values
+        null_col_name = f"{col_name}_null"
+        # Check if the null column exists and drop it if it does
+        if null_col_name in dummies.columns:
+            return dummies.drop(null_col_name)
+        return dummies
 class MultiBinaryDummifier:
@@ -332,7 +401,7 @@ class MultiBinaryDummifier:
             A list of strings, where each string is a keyword to search for. A separate
             binary column will be created for each keyword.
         case_insensitive (bool):
-            If True, keyword matching ignores case. Defaults to True.
+            If True, keyword matching ignores case.
     """
     def __init__(self, keywords: List[str], case_insensitive: bool = True):
         if not isinstance(keywords, list) or not all(isinstance(k, str) for k in keywords):
@@ -355,11 +424,12 @@ class MultiBinaryDummifier:
         Returns:
             pl.DataFrame: A DataFrame where each column corresponds to a keyword.
         """
+        column_base_name = column.name
         # Ensure the input is treated as a string, preserving nulls
         str_column = column.cast(pl.Utf8)
         output_expressions = []
-        for i, keyword in enumerate(self.keywords):
+        for keyword in self.keywords:
             # Escape keyword to treat it as a literal, not a regex pattern
             base_pattern = re.escape(keyword)
@@ -373,7 +443,7 @@ class MultiBinaryDummifier:
                 .when(str_column.str.contains(pattern))
                 .then(pl.lit(1, dtype=pl.UInt8))
                 .otherwise(pl.lit(0, dtype=pl.UInt8))
-                .alias(f"col_{i}") # Generic name for DataProcessor
+                .alias(f"{column_base_name}_{keyword}") # name for DataProcessor
             )
             output_expressions.append(expr)
@@ -417,6 +487,7 @@ class KeywordDummifier:
         Returns:
             pl.DataFrame: A DataFrame with one-hot encoded columns.
         """
+        column_base_name = column.name
         column = column.cast(pl.Utf8)
         categorize_expr = pl.when(pl.lit(False)).then(pl.lit(None, dtype=pl.Utf8))
@@ -435,22 +506,24 @@ class KeywordDummifier:
                 column.str.contains(pattern)
             ).then(pl.lit(name))
-        categorize_expr = categorize_expr.otherwise(None).alias("category")
+        dummy_name = 'dummy_category'
+        categorize_expr = categorize_expr.otherwise(None).alias(dummy_name)
         temp_df = pl.select(categorize_expr)
-        df_with_dummies = temp_df.to_dummies(columns=["category"])
+        df_with_dummies = temp_df.to_dummies(columns=[dummy_name])
         final_columns = []
         for name in self.group_names:
-            dummy_col_name = f"category_{name}"
+            dummy_col_name = f"{dummy_name}_{name}"
             if dummy_col_name in df_with_dummies.columns:
-                # The alias here uses the group name as the temporary column name
+                # The alias here uses the group name as the final column name
                 final_columns.append(
-                    df_with_dummies.get_column(dummy_col_name).alias(name)
+                    df_with_dummies.get_column(dummy_col_name).alias(f"{column_base_name}_{name}")
                 )
             else:
                 # If a group had no matches, create a column of zeros
-                final_columns.append(pl.lit(0, dtype=pl.UInt8).alias(name))
+                final_columns.append(pl.lit(0, dtype=pl.UInt8).alias(f"{column_base_name}_{name}"))
         return pl.select(final_columns)
@@ -471,7 +544,7 @@ class NumberExtractor:
         round_digits (int | None):
             If the dtype is 'float', you can specify the number of decimal
             places to round the result to. This parameter is ignored if
-            dtype is 'int'. Defaults to None (no rounding).
+            dtype is 'int'.
     """
     def __init__(
         self,
@@ -519,6 +592,7 @@ class NumberExtractor:
         Returns:
             pl.Series: A new Series containing the extracted numbers.
         """
+        column_base_name = column.name
         # Extract the first (and only) capturing group
         extracted = column.str.extract(self.regex_pattern, 1)
@@ -529,7 +603,7 @@ class NumberExtractor:
         if self.dtype == "float" and self.round_digits is not None:
             return casted.round(self.round_digits)
-        return casted
+        return casted.alias(column_base_name)
 class MultiNumberExtractor:
@@ -590,12 +664,13 @@ class MultiNumberExtractor:
         """
         Executes the multi-number extraction logic. Preserves nulls from the input column.
         """
+        column_base_name = column.name
         output_expressions = []
         for i in range(self.num_outputs):
             # Define the core extraction logic for the i-th number
             extraction_expr = (
                 column.str.extract_all(self.regex_pattern)
-                .list.get(i)
+                .list.get(i, null_on_oob=True)
                 .cast(self.polars_dtype, strict=False)
             )
@@ -609,24 +684,214 @@ class MultiNumberExtractor:
                 pl.when(column.is_not_null())
                 .then(extraction_expr)
                 .otherwise(None)
-                .alias(f"col_{i}") # Name the final output expression
+                .alias(f"{column_base_name}_{i}") # Name the final output expression
+            )
+            output_expressions.append(final_expr)
+        return pl.select(output_expressions)
+class TemperatureExtractor:
+    """
+    Extracts temperature values from a string column.
+    This transformer assumes that the source temperature values are in Celsius.
+    It can extract a single value using a specific regex or find all numbers in
+    a string and calculate their average. It also supports converting the final
+    Celsius value to Kelvin or Rankine.
+    Args:
+        regex_pattern (str):
+            The regex to find a single temperature. MUST contain exactly one
+            capturing group `(...)`. This is ignored if `average_mode` is True.
+        average_mode (bool):
+            If True, extracts all numbers from the string and returns their average.
+            This overrides the `regex_pattern` with a generic number-finding regex.
+        convert (str | None):
+            If "K", converts the final Celsius value to Kelvin.
+            If "R", converts the final Celsius value to Rankine.
+            If None (default), the value remains in Celsius.
+    """
+    def __init__(
+        self,
+        regex_pattern: str = r"(\d+\.?\d*)",
+        average_mode: bool = False,
+        convert: Optional[Literal["K", "R"]] = None,
+    ):
+        # --- Store configuration ---
+        self.average_mode = average_mode
+        self.convert = convert
+        self.regex_pattern = regex_pattern
+        # Generic pattern for average mode, defined once for efficiency.
+        self._avg_mode_pattern = r"(\d+\.?\d*)"
+        # --- Validation ---
+        if not self.average_mode:
+            try:
+                if re.compile(self.regex_pattern).groups != 1:
+                    _LOGGER.error("'regex_pattern' must contain exactly one capturing group '(...)' for single extraction mode.")
+                    raise ValueError()
+            except re.error as e:
+                _LOGGER.error(f"Invalid regex pattern provided: {e}")
+                raise ValueError()
+        if self.convert is not None and self.convert not in ["K", "R"]:
+            _LOGGER.error("'convert' must be either 'K' (Kelvin) or 'R' (Rankine).")
+            raise ValueError()
+    def __call__(self, column: pl.Series) -> pl.Series:
+        """
+        Applies the temperature extraction and conversion logic.
+        Args:
+            column (pl.Series): The input Polars Series with string data.
+        Returns:
+            pl.Series: A new Series containing the final temperature values as floats.
+        """
+        column_base_name = column.name
+        # --- Step 1: Extract number(s) to get a Celsius value expression ---
+        if self.average_mode:
+            # Extract all numbers and compute their mean. Polars' list.mean()
+            # handles the casting to float automatically.
+            celsius_expr = (
+                column.str.extract_all(self._avg_mode_pattern)
+                .list.eval(pl.element().cast(pl.Float64, strict=False))
+                .list.mean()
+            )
+        else:
+            # Extract a single number using the specified pattern.
+            # Cast to Float64, with non-matches becoming null.
+            celsius_expr = column.str.extract(self.regex_pattern, 1).cast(pl.Float64, strict=False)
+        # --- Step 2: Apply conversion if specified ---
+        if self.convert == "K":
+            # Celsius to Kelvin: C + 273.15
+            final_expr = celsius_expr + 273.15
+        elif self.convert == "R":
+            # Celsius to Rankine: (C * 9/5) + 491.67
+            final_expr = (celsius_expr * 1.8) + 491.67
+        else:
+            # No conversion needed
+            final_expr = celsius_expr
+        # --- Step 3: Round the result and return as a Series ---
+        # The select().to_series() pattern is a robust way to execute an
+        # expression and guarantee a Series is returned.
+        return pl.select(final_expr.round(2)).to_series().alias(column_base_name)
+class MultiTemperatureExtractor:
+    """
+    Extracts multiple temperature values from a single string column into
+    several new columns, assuming the source values are in Celsius.
+    This one-to-many transformer is designed for cases where multiple readings
+    are packed into one field, like "Min: 10C, Max: 25C".
+    Args:
+        num_outputs (int):
+            The number of numeric columns to create.
+        regex_pattern (str):
+            The regex to find all numbers. Must contain exactly one capturing
+            group around the number part (e.g., r"(-?\\d+\\.?\\d*)").
+        convert (str | None):
+            If "K", converts the final Celsius values to Kelvin.
+            If "R", converts the final Celsius values to Rankine.
+            If None (default), the values remain in Celsius.
+        fill_value (int | float | None):
+            A value to use if a temperature is not found at a given position.
+            For example, if `num_outputs=3` and only two temperatures are
+            found, the third column will be filled with this value. If None,
+            it will be filled with null.
+    """
+    def __init__(
+        self,
+        num_outputs: int,
+        regex_pattern: str = r"(\d+\.?\d*)",
+        convert: Optional[Literal["K", "R"]] = None,
+        fill_value: Optional[Union[int, float]] = None
+    ):
+        # --- Validation ---
+        if not isinstance(num_outputs, int) or num_outputs <= 0:
+            _LOGGER.error("'num_outputs' must be a positive integer.")
+            raise ValueError()
+        try:
+            if re.compile(regex_pattern).groups != 1:
+                _LOGGER.error("'regex_pattern' must contain exactly one capturing group '(...)'.")
+                raise ValueError()
+        except re.error as e:
+            _LOGGER.error(f"Invalid regex pattern provided: {e}")
+            raise ValueError()
+        if convert is not None and convert not in ["K", "R"]:
+            _LOGGER.error("'convert' must be either 'K' (Kelvin) or 'R' (Rankine).")
+            raise ValueError()
+        # --- Store configuration ---
+        self.num_outputs = num_outputs
+        self.regex_pattern = regex_pattern
+        self.convert = convert
+        self.fill_value = fill_value
+    def __call__(self, column: pl.Series) -> pl.DataFrame:
+        """
+        Applies the multi-temperature extraction and conversion logic.
+        """
+        column_base_name = column.name
+        output_expressions = []
+        for i in range(self.num_outputs):
+            # --- Step 1: Extract the i-th number as a Celsius value ---
+            celsius_expr = (
+                column.str.extract_all(self.regex_pattern)
+                .list.get(i, null_on_oob=True)
+                .cast(pl.Float64, strict=False)
+            )
+            # --- Step 2: Apply conversion if specified ---
+            if self.convert == "K":
+                # Celsius to Kelvin: C + 273.15
+                converted_expr = celsius_expr + 273.15
+            elif self.convert == "R":
+                # Celsius to Rankine: (C * 9/5) + 491.67
+                converted_expr = (celsius_expr * 1.8) + 491.67
+            else:
+                # No conversion needed
+                converted_expr = celsius_expr
+            # --- Step 3: Apply fill value and handle original nulls ---
+            final_expr = converted_expr.round(2)
+            if self.fill_value is not None:
+                final_expr = final_expr.fill_null(self.fill_value)
+            # Ensure that if the original row was null, all outputs are null
+            final_expr = (
+                pl.when(column.is_not_null())
+                .then(final_expr)
+                .otherwise(None)
+                .alias(f"{column_base_name}_{i}") # Temporary name for DataProcessor
             )
             output_expressions.append(final_expr)
+        # Execute all expressions at once for performance
         return pl.select(output_expressions)
 class RatioCalculator:
     """
     A transformer that parses a string ratio (e.g., "40:5" or "30/2") and
-    computes the result of the division. It gracefully handles strings that
-    do not match the pattern by returning null.
+    computes the result of the division. Includes robust handling for
+    zeros and single numbers.
     """
     def __init__(
         self,
-        # Default pattern includes the full-width colon '：'
-        regex_pattern: str = r"(\d+\.?\d*)\s*[:：/]\s*(\d+\.?\d*)"
+        regex_pattern: str = r"(\d+\.?\d*)\s*[:：/]\s*(\d+\.?\d*)",
+        handle_zeros: bool = False,
+        handle_single_number: bool = False
     ):
         # --- Robust Validation ---
         try:
@@ -642,24 +907,119 @@ class RatioCalculator:
             raise ValueError()
         self.regex_pattern = regex_pattern
+        self.handle_zeros = handle_zeros
+        self.handle_single_number = handle_single_number
     def __call__(self, column: pl.Series) -> pl.Series:
         """
-        Applies the ratio calculation logic to the input column.
-        This version uses .str.extract() for maximum stability.
+        Applies the ratio calculation logic to the input column. Uses .str.extract() for maximum stability and includes optional handling for zeros and single numbers.
         """
+        column_base_name = column.name
         # Extract numerator (group 1) and denominator (group 2) separately.
         numerator_expr = column.str.extract(self.regex_pattern, 1).cast(pl.Float64, strict=False)
         denominator_expr = column.str.extract(self.regex_pattern, 2).cast(pl.Float64, strict=False)
-        # Calculate the ratio, handling division by zero.
-        final_expr = pl.when(denominator_expr != 0).then(
-            numerator_expr / denominator_expr
-        ).otherwise(
-            None # Handles both null denominators and division by zero
-        )
+        # --- Logic for Requirement A: Special zero handling ---
+        if self.handle_zeros:
+            ratio_expr = (
+                pl.when(numerator_expr.is_not_null() & denominator_expr.is_not_null())
+                .then(
+                    pl.when((numerator_expr == 0) & (denominator_expr == 0)).then(pl.lit(0.0))
+                    .when((numerator_expr != 0) & (denominator_expr == 0)).then(numerator_expr)
+                    .when((numerator_expr == 0) & (denominator_expr != 0)).then(denominator_expr)
+                    .otherwise(numerator_expr / denominator_expr)  # Default: both are non-zero
+                )
+            )
+        else:
+            # Original logic
+            ratio_expr = pl.when(denominator_expr != 0).then(
+                numerator_expr / denominator_expr
+            ).otherwise(
+                None # Handles null denominators and division by zero
+            )
+        # --- Logic for Requirement B: Handle single numbers as a fallback ---
+        if self.handle_single_number:
+            # Regex to match a string that is ONLY a valid float/int
+            single_number_regex = r"^\d+\.?\d*$"
+            single_number_expr = (
+                pl.when(column.str.contains(single_number_regex))
+                .then(column.cast(pl.Float64, strict=False))
+                .otherwise(None)
+            )
+            # If ratio_expr is null, try to fill it with single_number_expr
+            final_expr = ratio_expr.fill_null(single_number_expr)
+        else:
+            final_expr = ratio_expr
+        return pl.select(final_expr.round(4)).to_series().alias(column_base_name)
+class TriRatioCalculator:
+    """
+    A transformer that handles three-part ("A:B:C") ratios, enforcing a strict output structure.
+    - Three-part ratios produce A/B and A/C.
+    - Two-part ratios are assumed to be A:C and produce None for A/B.
+    - Single values produce None for both outputs.
+    """
+    def __init__(self, handle_zeros: bool = False):
+        """
+        Initializes the TriRatioCalculator.
-        return pl.select(final_expr.round(4)).to_series()
+        Args:
+            handle_zeros (bool): If True, returns a valid value if either the denominator or numerator is zero; returns zero if both are zero.
+        """
+        self.handle_zeros = handle_zeros
+    def _calculate_ratio(self, num: pl.Expr, den: pl.Expr) -> pl.Expr:
+        """Helper to contain the core division logic."""
+        if self.handle_zeros:
+            # Special handling for zeros
+            expr = (
+                pl.when((num == 0) & (den == 0)).then(pl.lit(0.0))
+                .when((num != 0) & (den == 0)).then(num) # Return numerator
+                .when((num == 0) & (den != 0)).then(den) # Return denominator
+                .otherwise(num / den)
+            )
+        else:
+            # Default behavior: return null if denominator is 0
+            expr = pl.when(den != 0).then(num / den).otherwise(None)
+        return expr.round(4)
+    def __call__(self, column: pl.Series) -> pl.DataFrame:
+        """
+        Applies the robust tri-ratio logic using the lazy API.
+        """
+        column_base_name = column.name
+        # Wrap the input Series in a DataFrame to use the lazy expression API
+        temp_df = column.to_frame()
+        # Define all steps as lazy expressions
+        all_numbers_expr = pl.col(column.name).str.extract_all(r"(\d+\.?\d*)")
+        num_parts_expr = all_numbers_expr.list.len()
+        expr_A = all_numbers_expr.list.get(0, null_on_oob=True).cast(pl.Float64)
+        expr_B = all_numbers_expr.list.get(1, null_on_oob=True).cast(pl.Float64)
+        expr_C = all_numbers_expr.list.get(2, null_on_oob=True).cast(pl.Float64)
+        # Define logic for each output column using expressions
+        ratio_ab_expr = pl.when(num_parts_expr == 3).then(
+            self._calculate_ratio(expr_A, expr_B)
+        ).otherwise(None)
+        ratio_ac_expr = pl.when(num_parts_expr == 3).then(
+            self._calculate_ratio(expr_A, expr_C)
+        ).when(num_parts_expr == 2).then(
+            self._calculate_ratio(expr_A, expr_B) # B is actually C in this case
+        ).otherwise(None)
+        # Execute the expressions and return the final DataFrame
+        return temp_df.select(
+            ratio_ab_expr.alias(f"{column_base_name}_A_to_B"),
+            ratio_ac_expr.alias(f"{column_base_name}_A_to_C")
+        )
 class CategoryMapper:
@@ -699,6 +1059,7 @@ class CategoryMapper:
         Returns:
             pl.Series: A new Series with categories mapped to numbers.
         """
+        column_base_name = column.name
         # Ensure the column is treated as a string for matching keys
         str_column = column.cast(pl.Utf8)
@@ -715,7 +1076,7 @@ class CategoryMapper:
             pl.lit(self.default_value)
         )
-        return pl.select(final_expr).to_series()
+        return pl.select(final_expr).to_series().alias(column_base_name)
 class RegexMapper:
@@ -779,6 +1140,7 @@ class RegexMapper:
             pl.Series: A new Series with strings mapped to numbers based on
                        the first matching regex pattern.
         """
+        column_base_name = column.name
         # pl.String is the modern alias for pl.Utf8
         str_column = column.cast(pl.String)
@@ -793,7 +1155,7 @@ class RegexMapper:
                 .otherwise(mapping_expr)
             )
-        return pl.select(mapping_expr).to_series()
+        return pl.select(mapping_expr).to_series().alias(column_base_name)
 class ValueBinner:
@@ -843,6 +1205,7 @@ class ValueBinner:
             pl.Series: A new Series of integer labels for the bins. Values
                        outside the specified breaks will become null.
         """
+        column_base_name = column.name
         # `cut` creates a new column of type Categorical
         binned_column = column.cut(
             breaks=self.breaks,
@@ -852,7 +1215,7 @@ class ValueBinner:
         # to_physical() converts the Categorical type to its underlying
         # integer representation (u32), which is perfect for ML.
-        return binned_column.to_physical()
+        return binned_column.to_physical().alias(column_base_name)
 class DateFeatureExtractor:
@@ -861,16 +1224,6 @@ class DateFeatureExtractor:
     It can handle columns that are already in a Polars Date/Datetime format,
     or it can parse string columns if a format is provided.
-    Args:
-        features (List[str]):
-            A list of the date/time features to extract. Supported features are:
-            'year', 'month', 'day', 'hour', 'minute', 'second', 'millisecond',
-            'microsecond', 'nanosecond', 'ordinal_day' (day of year),
-            'weekday' (Mon=1, Sun=7), 'week' (week of year), and 'timestamp'.
-        format (str | None):
-            The format code used to parse string dates (e.g., "%Y-%m-%d %H:%M:%S").
-            Use if the input column is not a Date or Datetime type.
     """
     ALLOWED_FEATURES = {
@@ -883,6 +1236,17 @@ class DateFeatureExtractor:
         features: List[str],
         format: Optional[str] = None,
     ):
+        """
+        Args:
+            features (List[str]):
+                A list of the date/time features to extract. Supported features are:
+                'year', 'month', 'day', 'hour', 'minute', 'second', 'millisecond',
+                'microsecond', 'nanosecond', 'ordinal_day' (day of year),
+                'weekday' (Mon=1, Sun=7), 'week' (week of year), 'timestamp'.
+            format (str | None):
+                The format code used to parse string dates (e.g., "%Y-%m-%d %H:%M:%S").
+                Use if the input column is not a Date or Datetime type.
+        """
         # --- Validation ---
         if not isinstance(features, list) or not features:
             _LOGGER.error("'features' must be a non-empty list of strings.")
@@ -906,6 +1270,7 @@ class DateFeatureExtractor:
         Returns:
             pl.DataFrame: A DataFrame with columns for each extracted feature.
         """
+        column_base_name = column.name
         date_col = column
         # First, parse strings into a datetime object if a format is given
         if self.format is not None:
@@ -921,10 +1286,81 @@ class DateFeatureExtractor:
                 expr = getattr(date_col.dt, feature)()
             # Alias with a generic name for the processor to handle
-            output_expressions.append(expr.alias(f"col_{i}"))
+            output_expressions.append(expr.alias(f"{column_base_name}_{feature}"))
         return pl.select(output_expressions)
+class MolecularFormulaTransformer:
+    """
+    Parses a Polars Series of molecular formula strings into a wide DataFrame.
+    This one-to-many transformer takes a column of condensed molecular formulas
+    (e.g., 'Li0.115Mn0.529Ni0.339O2') and converts it into a DataFrame where
+    each chemical element has its own column. The value in each column is the
+    stoichiometric quantity of that element.
+    It is designed to be used within the DataProcessor pipeline.
+    """
+    def __init__(self):
+        """
+        Initializes the transformer and pre-compiles the regex pattern.
+        """
+        # Sort symbols by length to prevent matching 'C' in 'Co'
+        sorted_symbols = sorted(CHEMICAL_ELEMENT_SYMBOLS, key=len, reverse=True)
+        # Pre-compile regex for efficiency
+        self.pattern = re.compile(rf'({"|".join(sorted_symbols)})(\d*\.?\d*)')
+    def __call__(self, column: pl.Series) -> pl.DataFrame:
+        """
+        Executes the formula parsing logic.
+        Args:
+            column: A Polars Series containing strings of molecular formulas.
+        Returns:
+            A Polars DataFrame with columns for every chemical element.
+        """
+        column_base_name = column.name
+        def parse_formula(formula: str) -> dict:
+            """Helper to parse a single formula string into a dictionary."""
+            if not isinstance(formula, str) or not formula:
+                return {}
+            matches = self.pattern.findall(formula)
+            # This dict comprehension is correct for your use case where
+            # each element appears only once in the formula string.
+            return {
+                element: float(value) if value else 1.0
+                for element, value in matches
+            }
+        # Apply the parsing function to each element
+        parsed_series = column.map_elements(parse_formula, return_dtype=pl.Object)
+        # Convert the Series of dictionaries into a DataFrame
+        df = pl.DataFrame(parsed_series.to_list())
+        # Ensure all possible element columns are created, filling with 0
+        select_expressions = []
+        for symbol in CHEMICAL_ELEMENT_SYMBOLS:
+            col_name = f"{column_base_name}_{symbol}"
+            if symbol in df.columns:
+                expr = pl.col(symbol).fill_null(0).alias(col_name)
+            else:
+                expr = pl.lit(0.0, dtype=pl.Float64).alias(col_name)
+            select_expressions.append(expr)
+        # Handle edge case where input series is not empty but parsing yields no rows
+        base_df = df
+        if df.height == 0 and column.len() > 0:
+            base_df = pl.DataFrame({'dummy': range(column.len())})
+        return base_df.select(select_expressions)
 def info():
     """
     Passes this module's `__all__` list to the `_script_info` helper.
     NOTE(review): presumably reports the module's public names; confirm
     against the definition of `_script_info`, which is not visible here.
     """
     _script_info(__all__)

dragon-ml-toolbox 10.2.0__py3-none-any.whl → 14.2.0__py3-none-any.whl

Potentially problematic release.

dragon-ml-toolbox 10.2.0-py3-none-any.whl → 14.2.0-py3-none-any.whl