dragon-ml-toolbox 10.15.0__py3-none-any.whl → 11.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dragon-ml-toolbox might be problematic.

dragon_ml_toolbox-11.1.0.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: dragon-ml-toolbox
- Version: 10.15.0
+ Version: 11.1.0
  Summary: A collection of tools for data science and machine learning projects.
  Author-email: Karl Loza <luigiloza@gmail.com>
  License-Expression: MIT
@@ -139,6 +139,7 @@ pip install "dragon-ml-toolbox[pytorch]"
  #### Modules:
 
  ```bash
+ constants
  custom_logger
  data_exploration
  ensemble_evaluation
@@ -176,6 +177,7 @@ pip install "dragon-ml-toolbox[mice]"
  #### Modules:
 
  ```Bash
+ constants
  custom_logger
  MICE_imputation
  VIF_factor
@@ -196,6 +198,7 @@ pip install "dragon-ml-toolbox[excel]"
  #### Modules:
 
  ```Bash
+ constants
  custom_logger
  handle_excel
  path_manager
@@ -218,6 +221,7 @@ pip install "dragon-ml-toolbox[gui-boost,plot]"
  #### Modules:
 
  ```Bash
+ constants
  custom_logger
  GUI_tools
  ensemble_inference
@@ -241,6 +245,7 @@ pip install "dragon-ml-toolbox[gui-torch,plot]"
  #### Modules:
 
  ```Bash
+ constants
  custom_logger
  GUI_tools
  ML_models
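Every install profile above now includes the new `constants` module. A minimal import sketch (the module path follows the RECORD section below):

```python
from ml_tools import constants

print(constants.CHEMICAL_ELEMENT_SYMBOLS[:4])  # ['H', 'He', 'Li', 'Be']
print(constants.AVOGADRO_NUMBER)               # 6.02214076e+23
```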
dragon_ml_toolbox-11.1.0.dist-info/RECORD CHANGED
@@ -1,7 +1,7 @@
- dragon_ml_toolbox-10.15.0.dist-info/licenses/LICENSE,sha256=L35WDmmLZNTlJvxF6Vy7Uy4SYNi6rCfWUqlTHpoRMoU,1081
- dragon_ml_toolbox-10.15.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=iy2r_R7wjzsCbz_Q_jMsp_jfZ6oP8XW9QhwzRBH0mGY,1904
- ml_tools/ETL_cleaning.py,sha256=ECR3UwRMovifvDkVCyqmGDGlVhWst2eJS821NsRWny8,19851
- ml_tools/ETL_engineering.py,sha256=a6KCWH6kRatZtjaFEF_o917ApPMK5_vRD-BjfCDAl-E,49400
+ dragon_ml_toolbox-11.1.0.dist-info/licenses/LICENSE,sha256=L35WDmmLZNTlJvxF6Vy7Uy4SYNi6rCfWUqlTHpoRMoU,1081
+ dragon_ml_toolbox-11.1.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=iy2r_R7wjzsCbz_Q_jMsp_jfZ6oP8XW9QhwzRBH0mGY,1904
+ ml_tools/ETL_cleaning.py,sha256=-JrYkT8AvkZFK-Agzhp6uVxaZXzFw49t0txjf6Z1Apw,20365
+ ml_tools/ETL_engineering.py,sha256=pzv1WngYzdLo6eZX_JWRRAxNB0O4RvTaZzv5oj41WWA,54565
  ml_tools/GUI_tools.py,sha256=kEQWg-bog3pB5tI22gMGKWaCGHnz9TB2Lvvfhf5F2CI,45412
  ml_tools/MICE_imputation.py,sha256=kVSythWfxJFR4-2mtcYCWQaQ1Oz5yyx_SJu5gjnS7H8,11670
  ml_tools/ML_callbacks.py,sha256=JPvEw_cW5tYNJ2rMSgnNrKLuni_UrmuhDFaOw-u2SvA,13926
@@ -20,6 +20,7 @@ ml_tools/VIF_factor.py,sha256=MkMh_RIdsN2XUPzKNGRiEcmB17R_MmvGV4ezpL5zD2E,10403
  ml_tools/__init__.py,sha256=q0y9faQ6e17XCQ7eUiCZ1FJ4Bg5EQqLjZ9f_l5REUUY,41
  ml_tools/_logger.py,sha256=wcImAiXEZKPNcwM30qBh3t7HvoPURonJY0nrgMGF0sM,4719
  ml_tools/_script_info.py,sha256=21r83LV3RubsNZ_RTEUON6RbDf7Mh4_udweNcvdF_Fk,212
+ ml_tools/constants.py,sha256=3br5Rk9cL2IUo638eJuMOGdbGQaWssaUecYEvSeRBLM,3322
  ml_tools/custom_logger.py,sha256=ry43hk54K6xKo8jRAgq1sFxUpOA9T0LIJ7sw0so2BW0,5880
  ml_tools/data_exploration.py,sha256=-aTi5jmv4AepPgi2k_85qEJsSLx5zPOtTbhorqzUvGQ,38542
  ml_tools/ensemble_evaluation.py,sha256=FGHSe8LBI8_w8LjNeJWOcYQ1UK_mc6fVah8gmSvNVGg,26853
@@ -30,7 +31,7 @@ ml_tools/keys.py,sha256=FDpbS3Jb0pjrVvvp2_8nZi919mbob_-xwuy5OOtKM_A,1848
  ml_tools/optimization_tools.py,sha256=P3I6lIpvZ8Xf2kX5FvvBKBmrK2pB6idBpkTzfUJxTeE,5073
  ml_tools/path_manager.py,sha256=ke0MYOhYheRPX599GUbrvRsYHn2JKUmMDldS5LP6LQA,18431
  ml_tools/utilities.py,sha256=uheMUjQJ1zI69gASsE-mCq4KlRPVGgrgqson02rGNYM,30755
- dragon_ml_toolbox-10.15.0.dist-info/METADATA,sha256=2yN59s4nNgI3WbfE5l4-OyYmhjMQmB9uH3VYhjjprmI,6608
- dragon_ml_toolbox-10.15.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- dragon_ml_toolbox-10.15.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
- dragon_ml_toolbox-10.15.0.dist-info/RECORD,,
+ dragon_ml_toolbox-11.1.0.dist-info/METADATA,sha256=FvLmg4zkxGRpVyf-vt5DqKpSMY9GecfVd6MAbvPBA-Q,6657
+ dragon_ml_toolbox-11.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ dragon_ml_toolbox-11.1.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
+ dragon_ml_toolbox-11.1.0.dist-info/RECORD,,
ml_tools/ETL_cleaning.py CHANGED
@@ -19,20 +19,26 @@ __all__ = [
 
 
  ################ Unique Values per column #################
- def save_unique_values(csv_path: Union[str, Path], output_dir: Union[str, Path], verbose: bool=False) -> None:
+ def save_unique_values(csv_path: Union[str, Path],
+                        output_dir: Union[str, Path],
+                        verbose: bool=False,
+                        keep_column_order: bool = True) -> None:
      """
      Loads a CSV file, then analyzes it and saves the unique non-null values
      from each column into a separate text file exactly as they appear.
 
      This is useful for understanding the raw categories or range of values
-     within a dataset before cleaning.
+     within a dataset before and after cleaning.
 
      Args:
-         csv_path (Union[str, Path]):
+         csv_path (str | Path):
              The file path to the input CSV file.
-         output_dir (Union[str, Path]):
+         output_dir (str | Path):
              The path to the directory where the .txt files will be saved.
              The directory will be created if it does not exist.
+         keep_column_order (bool):
+             If True, prepends a numeric prefix (e.g., '1_', '2_') to each
+             output filename to maintain the original column order.
      """
      # --- 1. Input Validation ---
      csv_path = make_fullpath(input_path=csv_path, enforce="file")
@@ -74,7 +80,12 @@ def save_unique_values(csv_path: Union[str, Path], output_dir: Union[str, Path],
          sanitized_name = sanitize_filename(column_name)
          if not sanitized_name.strip('_'):
              sanitized_name = f'column_{i}'
-         file_path = output_dir / f"{sanitized_name}_unique_values.txt"
+
+         # --- create filename prefix ---
+         # If keep_column_order is True, create a prefix like "1_", "2_", etc.
+         prefix = f"{i + 1}_" if keep_column_order else ''
+
+         file_path = output_dir / f"{prefix}{sanitized_name}_unique_values.txt"
 
          # --- Write to file ---
          try:
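A usage sketch for the reworked `save_unique_values`; the file names here are hypothetical:

```python
from ml_tools.ETL_cleaning import save_unique_values

# With keep_column_order=True (the default), each output file gets a
# numeric prefix ("1_", "2_", ...) so listings preserve the original
# column order; pass False to keep the old un-prefixed names.
save_unique_values(
    csv_path="raw_dataset.csv",   # hypothetical input CSV
    output_dir="unique_values",   # created if it does not exist
    keep_column_order=True,
)
```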
@@ -126,9 +137,10 @@ def _cleaner_core(df_in: pl.DataFrame, all_lowercase: bool) -> pl.DataFrame:
      's': 's', 't': 't', 'u': 'u', 'v': 'v', 'w': 'w', 'x': 'x',
      'y': 'y', 'z': 'z',
      # Punctuation
-     '》': '>', '《': '<', ':': ':', '。': '.', ';': ';', '【': '[', '】': ']',
+     '》': '>', '《': '<', ':': ':', '。': '.', ';': ';', '【': '[', '】': ']', '∼': '~',
      '(': '(', ')': ')', '?': '?', '!': '!', '~': '~', '@': '@', '#': '#', '+': '+', '-': '-',
-     '$': '$', '%': '%', '^': '^', '&': '&', '*': '*', '\': '-', '|': '|', '≈':'=', '·': '-',
+     '$': '$', '%': '%', '^': '^', '&': '&', '*': '*', '\': '-', '|': '|', '≈':'=', '·': '', '⋅': '',
+     '¯': '-',
 
      # Commas (avoid commas in entries)
      ',': ';',
@@ -136,6 +148,8 @@ def _cleaner_core(df_in: pl.DataFrame, all_lowercase: bool) -> pl.DataFrame:
      '、':';',
 
      # Others
+     'σ': '',
+     '□': '',
      '©': '',
      '®': '',
      '™': '',
@@ -143,7 +157,6 @@ def _cleaner_core(df_in: pl.DataFrame, all_lowercase: bool) -> pl.DataFrame:
 
      # Replace special characters in entries
      r'\\': '_',
-     # '/': '_', # keep forward slash
 
      # Typographical standardization
      # Unify various dashes and hyphens to a standard hyphen
ml_tools/ETL_engineering.py CHANGED
@@ -6,6 +6,7 @@ from .utilities import load_dataframe, save_dataframe
  from .path_manager import make_fullpath
  from ._script_info import _script_info
  from ._logger import _LOGGER
+ from .constants import CHEMICAL_ELEMENT_SYMBOLS
 
 
  __all__ = [
@@ -24,7 +25,8 @@ __all__ = [
      "CategoryMapper",
      "RegexMapper",
      "ValueBinner",
-     "DateFeatureExtractor"
+     "DateFeatureExtractor",
+     "MolecularFormulaTransformer"
  ]
 
  ############ TRANSFORM MAIN ####################
@@ -48,17 +50,20 @@ class TransformationRecipe:
      def add(
          self,
          input_col_name: str,
-         output_col_names: Union[str, List[str]],
          transform: Union[str, Callable],
+         output_col_names: Optional[Union[str, List[str]]] = None
      ) -> "TransformationRecipe":
          """
          Adds a new transformation step to the recipe.
 
          Args:
-             input_col: The name of the column from the source DataFrame.
-             output_col: The desired name(s) for the output column(s).
-                 A string for a 1-to-1 mapping, or a list of strings
-                 for a 1-to-many mapping.
+             input_col_name: The name of the column from the source DataFrame.
+             output_col_names: The desired name(s) for the output column(s).
+                 - A string for a 1-to-1 mapping.
+                 - A list of strings for a 1-to-many mapping.
+                 - A string prefix for a 1-to-many mapping.
+                 - If None, the input name is used for 1-to-1 transforms,
+                   or the transformer's default names are used for 1-to-many.
              transform: The transformation to apply:
                  - Use "rename" for simple column renaming
                  - If callable, must accept a `pl.Series` as the only parameter and return either a `pl.Series` or `pl.DataFrame`.
@@ -78,10 +83,6 @@ class TransformationRecipe:
          elif not isinstance(transform, Callable):
              _LOGGER.error(f"'transform' must be a callable function or the string '{_RENAME}'.")
              raise TypeError()
-
-         if isinstance(output_col_names, list) and transform == _RENAME:
-             _LOGGER.error("A RENAME operation cannot have a list of output columns.")
-             raise ValueError()
 
          # --- Add Step ---
          step = {
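Since `output_col_names` is now optional and follows `transform`, a recipe entry can omit the output name entirely. A sketch of the updated call styles (assuming `add` still returns the recipe, as its signature indicates):

```python
import polars as pl
from ml_tools.ETL_engineering import TransformationRecipe, DataProcessor

recipe = TransformationRecipe()
# 1-to-1 rename with an explicit output name:
recipe.add("id", transform="rename", output_col_names="sample_id")
# output_col_names omitted: the transformed Series keeps the name "temp_c":
recipe.add("temp_c", transform=lambda s: s.cast(pl.Float64, strict=False))

processor = DataProcessor(recipe)
```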
@@ -105,7 +106,7 @@ class DataProcessor:
      """
      Transforms a Polars DataFrame based on a provided `TransformationRecipe` object.
 
-     Use the method `transform()`.
+     Use the methods `transform()` or `load_transform_save()`.
      """
      def __init__(self, recipe: TransformationRecipe):
          """
@@ -148,33 +149,53 @@ class DataProcessor:
              result = transform_action(input_series)
 
              if isinstance(result, pl.Series):
-                 if not isinstance(output_col_spec, str):
-                     _LOGGER.error(f"Function for '{input_col_name}' returned a Series but 'output_col' is not a string.")
+                 # Default to input name if spec is None
+                 output_name = output_col_spec if output_col_spec is not None else input_col_name
+
+                 if not isinstance(output_name, str):
+                     _LOGGER.error(f"Function for '{input_col_name}' returned a Series but 'output_col' must be a string or None.")
                      raise TypeError()
-                 processed_columns.append(result.alias(output_col_spec))
+                 processed_columns.append(result.alias(output_name))
 
              elif isinstance(result, pl.DataFrame):
-                 # 1. Handle list-based renaming
-                 if isinstance(output_col_spec, list):
+                 # 1. Handle None in output names
+                 if output_col_spec is None:
+                     # Use the column names generated by the transformer directly
+                     processed_columns.extend(result.get_columns())
+
+                 # 2. Handle list-based renaming
+                 elif isinstance(output_col_spec, list):
                      if len(result.columns) != len(output_col_spec):
                          _LOGGER.error(f"Mismatch in '{input_col_name}': function produced {len(result.columns)} columns, but recipe specifies {len(output_col_spec)} output names.")
                          raise ValueError()
 
                      renamed_df = result.rename(dict(zip(result.columns, output_col_spec)))
                      processed_columns.extend(renamed_df.get_columns())
-
-                 # 2. Handle a string prefix for AutoDummifier
+
+                 # 3. Global logic for adding a single prefix to all columns.
                  elif isinstance(output_col_spec, str):
                      prefix = output_col_spec
-                     # Replace the original name part with the desired prefix.
-                     new_names = {
-                         col: f"{prefix}{col[len(input_col_name):]}" for col in result.columns
-                     }
+                     new_names = {}
+
+                     for col in result.columns:
+                         # Case 1: Transformer's output column name contains the input name.
+                         # Action: Replace the input name with the desired prefix.
+                         # Example: input='color', output='color_red', prefix='spec' -> 'spec_red'
+                         if input_col_name in col:
+                             new_names[col] = col.replace(input_col_name, prefix, 1)
+
+                         # Case 2: Transformer's output is an independent name.
+                         # Action: Prepend the prefix to the output name.
+                         # Example: input='ratio', output='A_B', prefix='spec' -> 'spec_A_B'
+                         else:
+                             new_names[col] = f"{prefix}_{col}"
+
                      renamed_df = result.rename(new_names)
-                     processed_columns.extend(renamed_df.get_columns())
+                     processed_columns.extend(renamed_df.get_columns())
+
 
                  else:
-                     _LOGGER.error(f"Function for '{input_col_name}' returned a DataFrame, so 'output_col' must be a list of names or a string prefix.")
+                     _LOGGER.error(f"Function for '{input_col_name}' returned a DataFrame, so 'output_col' must be a list of names, a string prefix, or None.")
                      raise TypeError()
 
              else:
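The two prefix cases above are easiest to see in isolation. A standalone mirror of the renaming logic, for experimentation only (not the library's API):

```python
def apply_prefix(input_col_name: str, prefix: str, cols: list) -> dict:
    """Mirrors the DataProcessor prefix-renaming cases shown above."""
    new_names = {}
    for col in cols:
        if input_col_name in col:
            # Case 1: output contains the input name -> swap in the prefix.
            new_names[col] = col.replace(input_col_name, prefix, 1)
        else:
            # Case 2: independent output name -> prepend the prefix.
            new_names[col] = f"{prefix}_{col}"
    return new_names

assert apply_prefix("color", "spec", ["color_red"]) == {"color_red": "spec_red"}
assert apply_prefix("ratio", "spec", ["A_B"]) == {"A_B": "spec_A_B"}
```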
@@ -278,7 +299,7 @@ class BinaryTransformer:
              _LOGGER.error("Provide either 'true_keywords' or 'false_keywords', but not both.")
              raise ValueError()
          if true_keywords is None and false_keywords is None:
-             _LOGGER.error("You must provide either 'true_keywords' or 'false_keywords'.")
+             _LOGGER.error("Provide either 'true_keywords' or 'false_keywords'.")
              raise ValueError()
 
          # --- Configuration ---
@@ -310,16 +331,17 @@ class BinaryTransformer:
          Returns:
              pl.Series: A new Series of type UInt8 containing 1s and 0s.
          """
+         column_base_name = column.name
          # Create a boolean Series: True if any keyword is found, else False
          contains_keyword = column.str.contains(self.pattern)
 
          # Apply logic and cast directly to integer type
          if self.mode == "true_mode":
              # True -> 1, False -> 0
-             return contains_keyword.cast(pl.UInt8)
+             return contains_keyword.cast(pl.UInt8).alias(column_base_name)
          else:  # false_mode
              # We want the inverse: True -> 0, False -> 1
-             return (~contains_keyword).cast(pl.UInt8)
+             return (~contains_keyword).cast(pl.UInt8).alias(column_base_name)
 
 
  class AutoDummifier:
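Effect of the new alias on `BinaryTransformer`, sketched with the keyword argument visible in the validation code above:

```python
import polars as pl
from ml_tools.ETL_engineering import BinaryTransformer

flagger = BinaryTransformer(true_keywords=["yes", "approved"])
result = flagger(pl.Series("consent", ["yes", "no", "approved later"]))
# `result` is a UInt8 Series of 1s and 0s that now keeps the input
# name "consent" instead of coming back unnamed.
```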
@@ -389,11 +411,12 @@ class MultiBinaryDummifier:
          Returns:
              pl.DataFrame: A DataFrame where each column corresponds to a keyword.
          """
+         column_base_name = column.name
          # Ensure the input is treated as a string, preserving nulls
          str_column = column.cast(pl.Utf8)
 
          output_expressions = []
-         for i, keyword in enumerate(self.keywords):
+         for keyword in self.keywords:
              # Escape keyword to treat it as a literal, not a regex pattern
              base_pattern = re.escape(keyword)
 
@@ -407,7 +430,7 @@
                  .when(str_column.str.contains(pattern))
                  .then(pl.lit(1, dtype=pl.UInt8))
                  .otherwise(pl.lit(0, dtype=pl.UInt8))
-                 .alias(f"col_{i}")  # Generic name for DataProcessor
+                 .alias(f"{column_base_name}_{keyword}")  # name for DataProcessor
              )
              output_expressions.append(expr)
 
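A sketch of the new output naming for `MultiBinaryDummifier` (the `keywords` constructor argument is assumed from `self.keywords` above):

```python
import polars as pl
from ml_tools.ETL_engineering import MultiBinaryDummifier

dummifier = MultiBinaryDummifier(keywords=["red", "blue"])
df = dummifier(pl.Series("color", ["red and blue", "blue", "green"]))
# Columns are now "color_red" and "color_blue" (UInt8 flags),
# replacing the old generic "col_0", "col_1".
```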
@@ -451,6 +474,7 @@ class KeywordDummifier:
          Returns:
              pl.DataFrame: A DataFrame with one-hot encoded columns.
          """
+         column_base_name = column.name
          column = column.cast(pl.Utf8)
 
          categorize_expr = pl.when(pl.lit(False)).then(pl.lit(None, dtype=pl.Utf8))
 
@@ -469,22 +493,24 @@
                  column.str.contains(pattern)
              ).then(pl.lit(name))
 
-         categorize_expr = categorize_expr.otherwise(None).alias("category")
+         dummy_name = 'dummy_category'
+
+         categorize_expr = categorize_expr.otherwise(None).alias(dummy_name)
 
          temp_df = pl.select(categorize_expr)
-         df_with_dummies = temp_df.to_dummies(columns=["category"])
+         df_with_dummies = temp_df.to_dummies(columns=[dummy_name])
 
          final_columns = []
          for name in self.group_names:
-             dummy_col_name = f"category_{name}"
+             dummy_col_name = f"{dummy_name}_{name}"
              if dummy_col_name in df_with_dummies.columns:
-                 # The alias here uses the group name as the temporary column name
+                 # The alias here uses the group name as the final column name
                  final_columns.append(
-                     df_with_dummies.get_column(dummy_col_name).alias(name)
+                     df_with_dummies.get_column(dummy_col_name).alias(f"{column_base_name}_{name}")
                  )
              else:
                  # If a group had no matches, create a column of zeros
-                 final_columns.append(pl.lit(0, dtype=pl.UInt8).alias(name))
+                 final_columns.append(pl.lit(0, dtype=pl.UInt8).alias(f"{column_base_name}_{name}"))
 
          return pl.select(final_columns)
 
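For `KeywordDummifier` the change is analogous; a sketch in which `group_names` comes from the loop above and `group_keywords` is an assumed companion argument:

```python
import polars as pl
from ml_tools.ETL_engineering import KeywordDummifier

onehot = KeywordDummifier(
    group_names=["steel", "polymer"],                       # from the code above
    group_keywords=[["steel", "iron"], ["nylon", "PET"]],   # assumed argument name
)
df = onehot(pl.Series("material", ["cold-rolled steel", "PET bottle"]))
# One-hot columns arrive as "material_steel" and "material_polymer"
# rather than the bare group names "steel" / "polymer".
```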
@@ -553,6 +579,7 @@ class NumberExtractor:
          Returns:
              pl.Series: A new Series containing the extracted numbers.
          """
+         column_base_name = column.name
          # Extract the first (and only) capturing group
          extracted = column.str.extract(self.regex_pattern, 1)
 
@@ -563,7 +590,7 @@
          if self.dtype == "float" and self.round_digits is not None:
              return casted.round(self.round_digits)
 
-         return casted
+         return casted.alias(column_base_name)
 
 
  class MultiNumberExtractor:
@@ -624,6 +651,7 @@
          """
          Executes the multi-number extraction logic. Preserves nulls from the input column.
          """
+         column_base_name = column.name
          output_expressions = []
          for i in range(self.num_outputs):
              # Define the core extraction logic for the i-th number
@@ -643,7 +671,7 @@
                  pl.when(column.is_not_null())
                  .then(extraction_expr)
                  .otherwise(None)
-                 .alias(f"col_{i}")  # Name the final output expression
+                 .alias(f"{column_base_name}_{i}")  # Name the final output expression
              )
 
              output_expressions.append(final_expr)
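Both extractors now propagate the input column's name; a sketch with parameter names assumed from the attributes referenced above (`regex_pattern`, `dtype`, `round_digits`, `num_outputs`):

```python
import polars as pl
from ml_tools.ETL_engineering import NumberExtractor, MultiNumberExtractor

single = NumberExtractor(regex_pattern=r"(\d+\.?\d*)", dtype="float", round_digits=2)
s = single(pl.Series("pressure", ["12.5 MPa", "no reading"]))
# `s` keeps the name "pressure"; unmatched entries become null.

pair = MultiNumberExtractor(num_outputs=2, regex_pattern=r"(\d+\.?\d*)")
# Its outputs are now aliased "<input>_0", "<input>_1", ... instead of "col_0", "col_1".
```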
@@ -710,6 +738,7 @@ class TemperatureExtractor:
          Returns:
              pl.Series: A new Series containing the final temperature values as floats.
          """
+         column_base_name = column.name
          # --- Step 1: Extract number(s) to get a Celsius value expression ---
          if self.average_mode:
              # Extract all numbers and compute their mean. Polars' list.mean()
 
@@ -738,7 +767,7 @@
          # --- Step 3: Round the result and return as a Series ---
          # The select().to_series() pattern is a robust way to execute an
          # expression and guarantee a Series is returned.
-         return pl.select(final_expr.round(2)).to_series()
+         return pl.select(final_expr.round(2)).to_series().alias(column_base_name)
 
 
  class MultiTemperatureExtractor:
@@ -799,6 +828,7 @@ class MultiTemperatureExtractor:
          """
          Applies the multi-temperature extraction and conversion logic.
          """
+         column_base_name = column.name
          output_expressions = []
          for i in range(self.num_outputs):
              # --- Step 1: Extract the i-th number as a Celsius value ---
 
@@ -829,7 +859,7 @@
                  pl.when(column.is_not_null())
                  .then(final_expr)
                  .otherwise(None)
-                 .alias(f"col_{i}")  # Temporary name for DataProcessor
+                 .alias(f"{column_base_name}_{i}")  # name for DataProcessor
              )
 
              output_expressions.append(final_expr)
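The same renaming applies to the temperature extractors; a sketch using the `average_mode` attribute visible above (any unit-conversion options are omitted since this hunk does not show them):

```python
import polars as pl
from ml_tools.ETL_engineering import TemperatureExtractor

extractor = TemperatureExtractor(average_mode=True)
temps = extractor(pl.Series("sintering_temp", ["800-900 C", None]))
# `temps` is returned under the input name "sintering_temp"; before this
# change the pl.select(...).to_series() round-trip discarded that name.
```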
@@ -871,6 +901,7 @@ class RatioCalculator:
          """
          Applies the ratio calculation logic to the input column. Uses .str.extract() for maximum stability and includes optional handling for zeros and single numbers.
          """
+         column_base_name = column.name
          # Extract numerator (group 1) and denominator (group 2) separately.
          numerator_expr = column.str.extract(self.regex_pattern, 1).cast(pl.Float64, strict=False)
          denominator_expr = column.str.extract(self.regex_pattern, 2).cast(pl.Float64, strict=False)
 
@@ -908,7 +939,7 @@
          else:
              final_expr = ratio_expr
 
-         return pl.select(final_expr.round(4)).to_series()
+         return pl.select(final_expr.round(4)).to_series().alias(column_base_name)
 
 
  class TriRatioCalculator:
@@ -949,6 +980,7 @@ class TriRatioCalculator:
          """
          Applies the robust tri-ratio logic using the lazy API.
          """
+         column_base_name = column.name
          # Wrap the input Series in a DataFrame to use the lazy expression API
          temp_df = column.to_frame()
 
@@ -973,8 +1005,8 @@
 
          # Execute the expressions and return the final DataFrame
          return temp_df.select(
-             A_div_B=ratio_ab_expr,
-             A_div_C=ratio_ac_expr
+             ratio_ab_expr.alias(f"{column_base_name}_A_to_B"),
+             ratio_ac_expr.alias(f"{column_base_name}_A_to_C")
          )
 
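The tri-ratio output names are now derived from the input column; a sketch:

```python
import polars as pl
from ml_tools.ETL_engineering import TriRatioCalculator

calc = TriRatioCalculator()  # constructor options, if any, left at defaults
df = calc(pl.Series("blend", ["3:2:1", "1:1:2"]))
# Output columns are "blend_A_to_B" and "blend_A_to_C",
# replacing the fixed names "A_div_B" / "A_div_C".
```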
@@ -1015,6 +1047,7 @@ class CategoryMapper:
          Returns:
              pl.Series: A new Series with categories mapped to numbers.
          """
+         column_base_name = column.name
          # Ensure the column is treated as a string for matching keys
          str_column = column.cast(pl.Utf8)
 
@@ -1031,7 +1064,7 @@ class CategoryMapper:
              pl.lit(self.default_value)
          )
 
-         return pl.select(final_expr).to_series()
+         return pl.select(final_expr).to_series().alias(column_base_name)
 
 
  class RegexMapper:
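A sketch for `CategoryMapper`, with constructor arguments assumed from the attributes above (a category-to-number dict plus `default_value`):

```python
import polars as pl
from ml_tools.ETL_engineering import CategoryMapper

mapper = CategoryMapper(mapping={"low": 0, "mid": 1, "high": 2},  # assumed name
                        default_value=-1)
s = mapper(pl.Series("grade", ["low", "high", "unheard-of"]))
# `s` comes back named "grade"; unmapped strings fall back to -1.
```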
@@ -1095,6 +1128,7 @@ class RegexMapper:
              pl.Series: A new Series with strings mapped to numbers based on
              the first matching regex pattern.
          """
+         column_base_name = column.name
          # pl.String is the modern alias for pl.Utf8
          str_column = column.cast(pl.String)
 
@@ -1109,7 +1143,7 @@
              .otherwise(mapping_expr)
          )
 
-         return pl.select(mapping_expr).to_series()
+         return pl.select(mapping_expr).to_series().alias(column_base_name)
 
 
  class ValueBinner:
@@ -1159,6 +1193,7 @@ class ValueBinner:
              pl.Series: A new Series of integer labels for the bins. Values
              outside the specified breaks will become null.
          """
+         column_base_name = column.name
          # `cut` creates a new column of type Categorical
          binned_column = column.cut(
              breaks=self.breaks,
 
@@ -1168,7 +1203,7 @@
          )
 
          # to_physical() converts the Categorical type to its underlying
          # integer representation (u32), which is perfect for ML.
-         return binned_column.to_physical()
+         return binned_column.to_physical().alias(column_base_name)
 
 
  class DateFeatureExtractor:
@@ -1177,16 +1212,6 @@ class DateFeatureExtractor:
 
      It can handle columns that are already in a Polars Date/Datetime format,
      or it can parse string columns if a format is provided.
-
-     Args:
-         features (List[str]):
-             A list of the date/time features to extract. Supported features are:
-             'year', 'month', 'day', 'hour', 'minute', 'second', 'millisecond',
-             'microsecond', 'nanosecond', 'ordinal_day' (day of year),
-             'weekday' (Mon=1, Sun=7), 'week' (week of year), and 'timestamp'.
-         format (str | None):
-             The format code used to parse string dates (e.g., "%Y-%m-%d %H:%M:%S").
-             Use if the input column is not a Date or Datetime type.
      """
 
      ALLOWED_FEATURES = {
@@ -1199,6 +1224,17 @@ class DateFeatureExtractor:
          features: List[str],
          format: Optional[str] = None,
      ):
+         """
+         Args:
+             features (List[str]):
+                 A list of the date/time features to extract. Supported features are:
+                 'year', 'month', 'day', 'hour', 'minute', 'second', 'millisecond',
+                 'microsecond', 'nanosecond', 'ordinal_day' (day of year),
+                 'weekday' (Mon=1, Sun=7), 'week' (week of year), 'timestamp'.
+             format (str | None):
+                 The format code used to parse string dates (e.g., "%Y-%m-%d %H:%M:%S").
+                 Use if the input column is not a Date or Datetime type.
+         """
          # --- Validation ---
          if not isinstance(features, list) or not features:
              _LOGGER.error("'features' must be a non-empty list of strings.")
@@ -1222,6 +1258,7 @@
          Returns:
              pl.DataFrame: A DataFrame with columns for each extracted feature.
          """
+         column_base_name = column.name
          date_col = column
          # First, parse strings into a datetime object if a format is given
          if self.format is not None:
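A sketch for `DateFeatureExtractor`, whose `features` and `format` arguments are documented in the docstring above; the new per-feature column names appear in the next hunk:

```python
import polars as pl
from ml_tools.ETL_engineering import DateFeatureExtractor

extractor = DateFeatureExtractor(features=["year", "month", "weekday"],
                                 format="%Y-%m-%d")
df = extractor(pl.Series("ship_date", ["2024-01-15", "2024-02-01"]))
# Columns are now "ship_date_year", "ship_date_month", "ship_date_weekday"
# instead of the generic "col_0", "col_1", "col_2".
```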
@@ -1237,10 +1274,81 @@
              expr = getattr(date_col.dt, feature)()
 
              # Alias with a generic name for the processor to handle
-             output_expressions.append(expr.alias(f"col_{i}"))
+             output_expressions.append(expr.alias(f"{column_base_name}_{feature}"))
 
          return pl.select(output_expressions)
 
 
+ class MolecularFormulaTransformer:
+     """
+     Parses a Polars Series of molecular formula strings into a wide DataFrame.
+
+     This one-to-many transformer takes a column of condensed molecular formulas
+     (e.g., 'Li0.115Mn0.529Ni0.339O2') and converts it into a DataFrame where
+     each chemical element has its own column. The value in each column is the
+     stoichiometric quantity of that element.
+
+     It is designed to be used within the DataProcessor pipeline.
+     """
+
+     def __init__(self):
+         """
+         Initializes the transformer and pre-compiles the regex pattern.
+         """
+         # Sort symbols by length to prevent matching 'C' in 'Co'
+         sorted_symbols = sorted(CHEMICAL_ELEMENT_SYMBOLS, key=len, reverse=True)
+
+         # Pre-compile regex for efficiency
+         self.pattern = re.compile(rf'({"|".join(sorted_symbols)})(\d*\.?\d*)')
+
+     def __call__(self, column: pl.Series) -> pl.DataFrame:
+         """
+         Executes the formula parsing logic.
+
+         Args:
+             column: A Polars Series containing strings of molecular formulas.
+
+         Returns:
+             A Polars DataFrame with columns for every chemical element.
+         """
+         column_base_name = column.name
+
+         def parse_formula(formula: str) -> dict:
+             """Helper to parse a single formula string into a dictionary."""
+             if not isinstance(formula, str) or not formula:
+                 return {}
+
+             matches = self.pattern.findall(formula)
+
+             # This dict comprehension assumes that each element appears
+             # only once in the formula string.
+             return {
+                 element: float(value) if value else 1.0
+                 for element, value in matches
+             }
+
+         # Apply the parsing function to each entry
+         parsed_series = column.map_elements(parse_formula, return_dtype=pl.Object)
+
+         # Convert the Series of dictionaries into a DataFrame
+         df = pl.DataFrame(parsed_series.to_list())
+
+         # Ensure all possible element columns are created, filling with 0
+         select_expressions = []
+         for symbol in CHEMICAL_ELEMENT_SYMBOLS:
+             col_name = f"{column_base_name}_{symbol}"
+             if symbol in df.columns:
+                 expr = pl.col(symbol).fill_null(0).alias(col_name)
+             else:
+                 expr = pl.lit(0.0, dtype=pl.Float64).alias(col_name)
+             select_expressions.append(expr)
+
+         # Handle edge case where input series is not empty but parsing yields no rows
+         base_df = df
+         if df.height == 0 and column.len() > 0:
+             base_df = pl.DataFrame({'dummy': range(column.len())})
+
+         return base_df.select(select_expressions)
+
+
  def info():
      _script_info(__all__)
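A usage sketch for the new `MolecularFormulaTransformer`, standalone or as a one-to-many step in a `DataProcessor` recipe:

```python
import polars as pl
from ml_tools.ETL_engineering import MolecularFormulaTransformer

transformer = MolecularFormulaTransformer()
df = transformer(pl.Series("formula", ["Li0.115Mn0.529Ni0.339O2", "Fe2O3"]))

# One Float64 column per element, named "formula_<symbol>"; elements
# absent from a formula are 0.0, and an omitted quantity counts as 1.0.
print(df.select(["formula_Li", "formula_Mn", "formula_Fe", "formula_O"]))
```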
ml_tools/constants.py ADDED
@@ -0,0 +1,79 @@
+ CHEMICAL_ELEMENTS = [
+     "Hydrogen", "Helium", "Lithium", "Beryllium", "Boron", "Carbon", "Nitrogen", "Oxygen", "Fluorine", "Neon",
+     "Sodium", "Magnesium", "Aluminum", "Silicon", "Phosphorus", "Sulfur", "Chlorine", "Argon",
+     "Potassium", "Calcium", "Scandium", "Titanium", "Vanadium", "Chromium", "Manganese", "Iron", "Cobalt", "Nickel", "Copper", "Zinc",
+     "Gallium", "Germanium", "Arsenic", "Selenium", "Bromine", "Krypton",
+     "Rubidium", "Strontium", "Yttrium", "Zirconium", "Niobium", "Molybdenum", "Technetium", "Ruthenium", "Rhodium", "Palladium", "Silver", "Cadmium",
+     "Indium", "Tin", "Antimony", "Tellurium", "Iodine", "Xenon",
+     "Cesium", "Barium", "Lanthanum", "Cerium", "Praseodymium", "Neodymium", "Promethium", "Samarium", "Europium", "Gadolinium", "Terbium", "Dysprosium", "Holmium", "Erbium", "Thulium", "Ytterbium", "Lutetium",
+     "Hafnium", "Tantalum", "Tungsten", "Rhenium", "Osmium", "Iridium", "Platinum", "Gold", "Mercury",
+     "Thallium", "Lead", "Bismuth", "Polonium", "Astatine", "Radon",
+     "Francium", "Radium", "Actinium", "Thorium", "Protactinium", "Uranium", "Neptunium", "Plutonium", "Americium", "Curium", "Berkelium", "Californium", "Einsteinium", "Fermium", "Mendelevium", "Nobelium", "Lawrencium",
+     "Rutherfordium", "Dubnium", "Seaborgium", "Bohrium", "Hassium", "Meitnerium", "Darmstadtium", "Roentgenium", "Copernicium", "Nihonium", "Flerovium", "Moscovium", "Livermorium", "Tennessine", "Oganesson"
+ ]
+
+ CHEMICAL_ELEMENT_SYMBOLS = [
+     "H", "He", "Li", "Be", "B", "C", "N", "O", "F", "Ne",
+     "Na", "Mg", "Al", "Si", "P", "S", "Cl", "Ar",
+     "K", "Ca", "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn",
+     "Ga", "Ge", "As", "Se", "Br", "Kr",
+     "Rb", "Sr", "Y", "Zr", "Nb", "Mo", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd",
+     "In", "Sn", "Sb", "Te", "I", "Xe",
+     "Cs", "Ba", "La", "Ce", "Pr", "Nd", "Pm", "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb", "Lu",
+     "Hf", "Ta", "W", "Re", "Os", "Ir", "Pt", "Au", "Hg",
+     "Tl", "Pb", "Bi", "Po", "At", "Rn",
+     "Fr", "Ra", "Ac", "Th", "Pa", "U", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm", "Md", "No", "Lr",
+     "Rf", "Db", "Sg", "Bh", "Hs", "Mt", "Ds", "Rg", "Cn", "Nh", "Fl", "Mc", "Lv", "Ts", "Og"
+ ]
+
+ # --- Physics & Chemistry ---
+
+ # Speed of light in vacuum (m/s)
+ SPEED_OF_LIGHT = 299792458.0
+
+ # Planck constant (J·s)
+ PLANCK_CONSTANT = 6.62607015e-34
+
+ # Avogadro's number (mol⁻¹)
+ AVOGADRO_NUMBER = 6.02214076e23
+
+ # Universal gas constant (J/(mol·K))
+ UNIVERSAL_GAS_CONSTANT = 8.314462618
+
+ # Boltzmann constant (J/K)
+ BOLTZMANN_CONSTANT = 1.380649e-23
+
+ # Gravitational constant (m³·kg⁻¹·s⁻²)
+ GRAVITATIONAL_CONSTANT = 6.67430e-11
+
+ # Standard acceleration of gravity on Earth (m/s²)
+ STANDARD_GRAVITY = 9.80665
+
+ # Elementary charge (C)
+ ELEMENTARY_CHARGE = 1.602176634e-19
+
+ # Electron mass (kg)
+ ELECTRON_MASS_KG = 9.1093837015e-31
+
+ # Proton mass (kg)
+ PROTON_MASS_KG = 1.67262192369e-27
+
+ # Absolute zero (in Celsius)
+ ABSOLUTE_ZERO_CELSIUS = -273.15
+
+ # --- Astronomy ---
+
+ # Astronomical Unit, the mean Earth-Sun distance (kilometers)
+ ASTRONOMICAL_UNIT_KM = 149597870.7
+
+ # Light-year (kilometers)
+ LIGHT_YEAR_KM = 9460730472580.8
+
+ # Earth's equatorial radius (kilometers)
+ EARTH_RADIUS_KM = 6378.137
+
+ # Mass of the Earth (kg)
+ EARTH_MASS_KG = 5.9722e24
+
+ # Mass of the Sun (kg)
+ SUN_MASS_KG = 1.98847e30
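A small sketch showing how the new constants compose (using only values defined above):

```python
from ml_tools.constants import ABSOLUTE_ZERO_CELSIUS, CHEMICAL_ELEMENT_SYMBOLS

def celsius_to_kelvin(t_celsius: float) -> float:
    """K = °C - (-273.15)."""
    return t_celsius - ABSOLUTE_ZERO_CELSIUS

assert celsius_to_kelvin(25.0) == 298.15
assert len(CHEMICAL_ELEMENT_SYMBOLS) == 118  # H through Og
```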