dragon-ml-toolbox 19.14.0__py3-none-any.whl → 20.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (219) hide show
  1. {dragon_ml_toolbox-19.14.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/METADATA +29 -46
  2. dragon_ml_toolbox-20.0.0.dist-info/RECORD +178 -0
  3. ml_tools/{ETL_cleaning.py → ETL_cleaning/__init__.py} +13 -5
  4. ml_tools/ETL_cleaning/_basic_clean.py +351 -0
  5. ml_tools/ETL_cleaning/_clean_tools.py +128 -0
  6. ml_tools/ETL_cleaning/_dragon_cleaner.py +245 -0
  7. ml_tools/ETL_cleaning/_imprimir.py +13 -0
  8. ml_tools/{ETL_engineering.py → ETL_engineering/__init__.py} +8 -4
  9. ml_tools/ETL_engineering/_dragon_engineering.py +261 -0
  10. ml_tools/ETL_engineering/_imprimir.py +24 -0
  11. ml_tools/{_core/_ETL_engineering.py → ETL_engineering/_transforms.py} +14 -267
  12. ml_tools/{_core → GUI_tools}/_GUI_tools.py +37 -40
  13. ml_tools/{GUI_tools.py → GUI_tools/__init__.py} +7 -5
  14. ml_tools/GUI_tools/_imprimir.py +12 -0
  15. ml_tools/IO_tools/_IO_loggers.py +235 -0
  16. ml_tools/IO_tools/_IO_save_load.py +151 -0
  17. ml_tools/IO_tools/_IO_utils.py +140 -0
  18. ml_tools/{IO_tools.py → IO_tools/__init__.py} +13 -5
  19. ml_tools/IO_tools/_imprimir.py +14 -0
  20. ml_tools/MICE/_MICE_imputation.py +132 -0
  21. ml_tools/{MICE_imputation.py → MICE/__init__.py} +6 -7
  22. ml_tools/{_core/_MICE_imputation.py → MICE/_dragon_mice.py} +243 -322
  23. ml_tools/MICE/_imprimir.py +11 -0
  24. ml_tools/{ML_callbacks.py → ML_callbacks/__init__.py} +12 -4
  25. ml_tools/ML_callbacks/_base.py +101 -0
  26. ml_tools/ML_callbacks/_checkpoint.py +232 -0
  27. ml_tools/ML_callbacks/_early_stop.py +208 -0
  28. ml_tools/ML_callbacks/_imprimir.py +12 -0
  29. ml_tools/ML_callbacks/_scheduler.py +197 -0
  30. ml_tools/{ML_chaining_utilities.py → ML_chain/__init__.py} +8 -3
  31. ml_tools/{_core/_ML_chaining_utilities.py → ML_chain/_chaining_tools.py} +5 -129
  32. ml_tools/ML_chain/_dragon_chain.py +140 -0
  33. ml_tools/ML_chain/_imprimir.py +11 -0
  34. ml_tools/ML_configuration/__init__.py +90 -0
  35. ml_tools/ML_configuration/_base_model_config.py +69 -0
  36. ml_tools/ML_configuration/_finalize.py +366 -0
  37. ml_tools/ML_configuration/_imprimir.py +47 -0
  38. ml_tools/ML_configuration/_metrics.py +593 -0
  39. ml_tools/ML_configuration/_models.py +206 -0
  40. ml_tools/ML_configuration/_training.py +124 -0
  41. ml_tools/ML_datasetmaster/__init__.py +28 -0
  42. ml_tools/ML_datasetmaster/_base_datasetmaster.py +337 -0
  43. ml_tools/{_core/_ML_datasetmaster.py → ML_datasetmaster/_datasetmaster.py} +9 -329
  44. ml_tools/ML_datasetmaster/_imprimir.py +15 -0
  45. ml_tools/{_core/_ML_sequence_datasetmaster.py → ML_datasetmaster/_sequence_datasetmaster.py} +13 -15
  46. ml_tools/{_core/_ML_vision_datasetmaster.py → ML_datasetmaster/_vision_datasetmaster.py} +63 -65
  47. ml_tools/ML_evaluation/__init__.py +53 -0
  48. ml_tools/ML_evaluation/_classification.py +629 -0
  49. ml_tools/ML_evaluation/_feature_importance.py +409 -0
  50. ml_tools/ML_evaluation/_imprimir.py +25 -0
  51. ml_tools/ML_evaluation/_loss.py +92 -0
  52. ml_tools/ML_evaluation/_regression.py +273 -0
  53. ml_tools/{_core/_ML_sequence_evaluation.py → ML_evaluation/_sequence.py} +8 -11
  54. ml_tools/{_core/_ML_vision_evaluation.py → ML_evaluation/_vision.py} +12 -17
  55. ml_tools/{_core → ML_evaluation_captum}/_ML_evaluation_captum.py +11 -38
  56. ml_tools/{ML_evaluation_captum.py → ML_evaluation_captum/__init__.py} +6 -4
  57. ml_tools/ML_evaluation_captum/_imprimir.py +10 -0
  58. ml_tools/{_core → ML_finalize_handler}/_ML_finalize_handler.py +3 -7
  59. ml_tools/ML_finalize_handler/__init__.py +10 -0
  60. ml_tools/ML_finalize_handler/_imprimir.py +8 -0
  61. ml_tools/ML_inference/__init__.py +22 -0
  62. ml_tools/ML_inference/_base_inference.py +166 -0
  63. ml_tools/{_core/_ML_chaining_inference.py → ML_inference/_chain_inference.py} +14 -17
  64. ml_tools/ML_inference/_dragon_inference.py +332 -0
  65. ml_tools/ML_inference/_imprimir.py +11 -0
  66. ml_tools/ML_inference/_multi_inference.py +180 -0
  67. ml_tools/ML_inference_sequence/__init__.py +10 -0
  68. ml_tools/ML_inference_sequence/_imprimir.py +8 -0
  69. ml_tools/{_core/_ML_sequence_inference.py → ML_inference_sequence/_sequence_inference.py} +11 -15
  70. ml_tools/ML_inference_vision/__init__.py +10 -0
  71. ml_tools/ML_inference_vision/_imprimir.py +8 -0
  72. ml_tools/{_core/_ML_vision_inference.py → ML_inference_vision/_vision_inference.py} +15 -19
  73. ml_tools/ML_models/__init__.py +32 -0
  74. ml_tools/{_core/_ML_models_advanced.py → ML_models/_advanced_models.py} +22 -18
  75. ml_tools/ML_models/_base_mlp_attention.py +198 -0
  76. ml_tools/{_core/_models_advanced_base.py → ML_models/_base_save_load.py} +73 -49
  77. ml_tools/ML_models/_dragon_tabular.py +248 -0
  78. ml_tools/ML_models/_imprimir.py +18 -0
  79. ml_tools/ML_models/_mlp_attention.py +134 -0
  80. ml_tools/{_core → ML_models}/_models_advanced_helpers.py +13 -13
  81. ml_tools/ML_models_sequence/__init__.py +10 -0
  82. ml_tools/ML_models_sequence/_imprimir.py +8 -0
  83. ml_tools/{_core/_ML_sequence_models.py → ML_models_sequence/_sequence_models.py} +5 -8
  84. ml_tools/ML_models_vision/__init__.py +29 -0
  85. ml_tools/ML_models_vision/_base_wrapper.py +254 -0
  86. ml_tools/ML_models_vision/_image_classification.py +182 -0
  87. ml_tools/ML_models_vision/_image_segmentation.py +108 -0
  88. ml_tools/ML_models_vision/_imprimir.py +16 -0
  89. ml_tools/ML_models_vision/_object_detection.py +135 -0
  90. ml_tools/ML_optimization/__init__.py +21 -0
  91. ml_tools/ML_optimization/_imprimir.py +13 -0
  92. ml_tools/{_core/_ML_optimization_pareto.py → ML_optimization/_multi_dragon.py} +18 -24
  93. ml_tools/ML_optimization/_single_dragon.py +203 -0
  94. ml_tools/{_core/_ML_optimization.py → ML_optimization/_single_manual.py} +75 -213
  95. ml_tools/{_core → ML_scaler}/_ML_scaler.py +8 -11
  96. ml_tools/ML_scaler/__init__.py +10 -0
  97. ml_tools/ML_scaler/_imprimir.py +8 -0
  98. ml_tools/ML_trainer/__init__.py +20 -0
  99. ml_tools/ML_trainer/_base_trainer.py +297 -0
  100. ml_tools/ML_trainer/_dragon_detection_trainer.py +402 -0
  101. ml_tools/ML_trainer/_dragon_sequence_trainer.py +540 -0
  102. ml_tools/ML_trainer/_dragon_trainer.py +1160 -0
  103. ml_tools/ML_trainer/_imprimir.py +10 -0
  104. ml_tools/{ML_utilities.py → ML_utilities/__init__.py} +14 -6
  105. ml_tools/ML_utilities/_artifact_finder.py +382 -0
  106. ml_tools/ML_utilities/_imprimir.py +16 -0
  107. ml_tools/ML_utilities/_inspection.py +325 -0
  108. ml_tools/ML_utilities/_train_tools.py +205 -0
  109. ml_tools/{ML_vision_transformers.py → ML_vision_transformers/__init__.py} +9 -6
  110. ml_tools/{_core/_ML_vision_transformers.py → ML_vision_transformers/_core_transforms.py} +11 -155
  111. ml_tools/ML_vision_transformers/_imprimir.py +14 -0
  112. ml_tools/ML_vision_transformers/_offline_augmentation.py +159 -0
  113. ml_tools/{_core/_PSO_optimization.py → PSO_optimization/_PSO.py} +58 -15
  114. ml_tools/{PSO_optimization.py → PSO_optimization/__init__.py} +5 -3
  115. ml_tools/PSO_optimization/_imprimir.py +10 -0
  116. ml_tools/SQL/__init__.py +7 -0
  117. ml_tools/{_core/_SQL.py → SQL/_dragon_SQL.py} +7 -11
  118. ml_tools/SQL/_imprimir.py +8 -0
  119. ml_tools/{_core → VIF}/_VIF_factor.py +5 -8
  120. ml_tools/{VIF_factor.py → VIF/__init__.py} +4 -2
  121. ml_tools/VIF/_imprimir.py +10 -0
  122. ml_tools/_core/__init__.py +7 -1
  123. ml_tools/_core/_logger.py +8 -18
  124. ml_tools/_core/_schema_load_ops.py +43 -0
  125. ml_tools/_core/_script_info.py +2 -2
  126. ml_tools/{data_exploration.py → data_exploration/__init__.py} +32 -16
  127. ml_tools/data_exploration/_analysis.py +214 -0
  128. ml_tools/data_exploration/_cleaning.py +566 -0
  129. ml_tools/data_exploration/_features.py +583 -0
  130. ml_tools/data_exploration/_imprimir.py +32 -0
  131. ml_tools/data_exploration/_plotting.py +487 -0
  132. ml_tools/data_exploration/_schema_ops.py +176 -0
  133. ml_tools/{ensemble_evaluation.py → ensemble_evaluation/__init__.py} +6 -4
  134. ml_tools/{_core → ensemble_evaluation}/_ensemble_evaluation.py +3 -7
  135. ml_tools/ensemble_evaluation/_imprimir.py +14 -0
  136. ml_tools/{ensemble_inference.py → ensemble_inference/__init__.py} +5 -3
  137. ml_tools/{_core → ensemble_inference}/_ensemble_inference.py +15 -18
  138. ml_tools/ensemble_inference/_imprimir.py +9 -0
  139. ml_tools/{ensemble_learning.py → ensemble_learning/__init__.py} +4 -6
  140. ml_tools/{_core → ensemble_learning}/_ensemble_learning.py +7 -10
  141. ml_tools/ensemble_learning/_imprimir.py +10 -0
  142. ml_tools/{excel_handler.py → excel_handler/__init__.py} +5 -3
  143. ml_tools/{_core → excel_handler}/_excel_handler.py +6 -10
  144. ml_tools/excel_handler/_imprimir.py +13 -0
  145. ml_tools/{keys.py → keys/__init__.py} +4 -1
  146. ml_tools/keys/_imprimir.py +11 -0
  147. ml_tools/{_core → keys}/_keys.py +2 -0
  148. ml_tools/{math_utilities.py → math_utilities/__init__.py} +5 -2
  149. ml_tools/math_utilities/_imprimir.py +11 -0
  150. ml_tools/{_core → math_utilities}/_math_utilities.py +1 -5
  151. ml_tools/{optimization_tools.py → optimization_tools/__init__.py} +9 -4
  152. ml_tools/optimization_tools/_imprimir.py +13 -0
  153. ml_tools/optimization_tools/_optimization_bounds.py +236 -0
  154. ml_tools/optimization_tools/_optimization_plots.py +218 -0
  155. ml_tools/{path_manager.py → path_manager/__init__.py} +6 -3
  156. ml_tools/{_core/_path_manager.py → path_manager/_dragonmanager.py} +11 -347
  157. ml_tools/path_manager/_imprimir.py +15 -0
  158. ml_tools/path_manager/_path_tools.py +346 -0
  159. ml_tools/plot_fonts/__init__.py +8 -0
  160. ml_tools/plot_fonts/_imprimir.py +8 -0
  161. ml_tools/{_core → plot_fonts}/_plot_fonts.py +2 -5
  162. ml_tools/schema/__init__.py +15 -0
  163. ml_tools/schema/_feature_schema.py +223 -0
  164. ml_tools/schema/_gui_schema.py +191 -0
  165. ml_tools/schema/_imprimir.py +10 -0
  166. ml_tools/{serde.py → serde/__init__.py} +4 -2
  167. ml_tools/serde/_imprimir.py +10 -0
  168. ml_tools/{_core → serde}/_serde.py +3 -8
  169. ml_tools/{utilities.py → utilities/__init__.py} +11 -6
  170. ml_tools/utilities/_imprimir.py +18 -0
  171. ml_tools/{_core/_utilities.py → utilities/_utility_save_load.py} +13 -190
  172. ml_tools/utilities/_utility_tools.py +192 -0
  173. dragon_ml_toolbox-19.14.0.dist-info/RECORD +0 -111
  174. ml_tools/ML_chaining_inference.py +0 -8
  175. ml_tools/ML_configuration.py +0 -86
  176. ml_tools/ML_configuration_pytab.py +0 -14
  177. ml_tools/ML_datasetmaster.py +0 -10
  178. ml_tools/ML_evaluation.py +0 -16
  179. ml_tools/ML_evaluation_multi.py +0 -12
  180. ml_tools/ML_finalize_handler.py +0 -8
  181. ml_tools/ML_inference.py +0 -12
  182. ml_tools/ML_models.py +0 -14
  183. ml_tools/ML_models_advanced.py +0 -14
  184. ml_tools/ML_models_pytab.py +0 -14
  185. ml_tools/ML_optimization.py +0 -14
  186. ml_tools/ML_optimization_pareto.py +0 -8
  187. ml_tools/ML_scaler.py +0 -8
  188. ml_tools/ML_sequence_datasetmaster.py +0 -8
  189. ml_tools/ML_sequence_evaluation.py +0 -10
  190. ml_tools/ML_sequence_inference.py +0 -8
  191. ml_tools/ML_sequence_models.py +0 -8
  192. ml_tools/ML_trainer.py +0 -12
  193. ml_tools/ML_vision_datasetmaster.py +0 -12
  194. ml_tools/ML_vision_evaluation.py +0 -10
  195. ml_tools/ML_vision_inference.py +0 -8
  196. ml_tools/ML_vision_models.py +0 -18
  197. ml_tools/SQL.py +0 -8
  198. ml_tools/_core/_ETL_cleaning.py +0 -694
  199. ml_tools/_core/_IO_tools.py +0 -498
  200. ml_tools/_core/_ML_callbacks.py +0 -702
  201. ml_tools/_core/_ML_configuration.py +0 -1332
  202. ml_tools/_core/_ML_configuration_pytab.py +0 -102
  203. ml_tools/_core/_ML_evaluation.py +0 -867
  204. ml_tools/_core/_ML_evaluation_multi.py +0 -544
  205. ml_tools/_core/_ML_inference.py +0 -646
  206. ml_tools/_core/_ML_models.py +0 -668
  207. ml_tools/_core/_ML_models_pytab.py +0 -693
  208. ml_tools/_core/_ML_trainer.py +0 -2323
  209. ml_tools/_core/_ML_utilities.py +0 -886
  210. ml_tools/_core/_ML_vision_models.py +0 -644
  211. ml_tools/_core/_data_exploration.py +0 -1909
  212. ml_tools/_core/_optimization_tools.py +0 -493
  213. ml_tools/_core/_schema.py +0 -359
  214. ml_tools/plot_fonts.py +0 -8
  215. ml_tools/schema.py +0 -12
  216. {dragon_ml_toolbox-19.14.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/WHEEL +0 -0
  217. {dragon_ml_toolbox-19.14.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/licenses/LICENSE +0 -0
  218. {dragon_ml_toolbox-19.14.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
  219. {dragon_ml_toolbox-19.14.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,245 @@
1
+ import polars as pl
2
+ from pathlib import Path
3
+ from typing import Union
4
+
5
+ from ..utilities import save_dataframe_filename, load_dataframe
6
+
7
+ from .._core import get_logger
8
+ from ..path_manager import make_fullpath
9
+
10
+ from ._clean_tools import save_unique_values
11
+
12
+
13
+ _LOGGER = get_logger("DragonCleaner")
14
+
15
+
16
+ __all__ = [
17
+ "DragonColumnCleaner",
18
+ "DragonDataFrameCleaner",
19
+ ]
20
+
21
+
22
class DragonColumnCleaner:
    """
    Configuration object describing regex cleaning rules for one Polars column.

    Holds the target column name, a mapping of regex patterns to replacement
    strings (or None to nullify matches), and a case-sensitivity flag. Meant to
    be consumed by a `DragonDataFrameCleaner`.

    Notes:
        - Order rules from most specific to most general to build a fallback chain.
        - Watch out for chain replacements: a later rule can match text that was
          already rewritten by an earlier rule in the same cleaner.
    """
    def __init__(self,
                 column_name: str,
                 rules: Union[dict[str, Union[str, None]], dict[str, str]],
                 case_insensitive: bool = False):
        """
        Args:
            column_name (str):
                Name of the column to clean. Must be a non-empty string.
            rules (Dict[str, str | None]):
                Mapping of regex patterns to replacement strings.
                - A replacement of None marks matching values for conversion to null.
                - Backreferences to captured groups use Polars' '$' prefix (e.g. r'$1 $2').
            case_insensitive (bool):
                If True, regex matching ignores case.

        Raises:
            TypeError: If any argument has the wrong type.

        ## Usage Example

        ```python
        id_rules = {
            # Matches 'ID-12345' or 'ID 12345' and reformats to 'ID:12345'
            r'ID[- ](\\d+)': r'ID:$1'
        }

        id_cleaner = DragonColumnCleaner(column_name='user_id', rules=id_rules)
        # This object would then be passed to a DragonDataFrameCleaner.
        ```
        """
        if not (isinstance(column_name, str) and column_name):
            _LOGGER.error("The 'column_name' must be a non-empty string.")
            raise TypeError()
        if not isinstance(rules, dict):
            _LOGGER.error("The 'rules' argument must be a dictionary.")
            raise TypeError()

        # Every key must be a regex string; every value a string or None.
        for regex_pattern, substitute in rules.items():
            if not isinstance(regex_pattern, str):
                _LOGGER.error("All keys in 'rules' must be strings representing regex patterns.")
                raise TypeError()
            if substitute is not None and not isinstance(substitute, str):
                _LOGGER.error("All values in 'rules' must be strings or None (for nullification).")
                raise TypeError()

        self.column_name = column_name
        self.rules = rules
        self.case_insensitive = case_insensitive

    def preview(self,
                csv_path: Union[str, Path],
                report_dir: Union[str, Path],
                add_value_separator: bool = False,
                rule_batch_size: int = 150):
        """
        Writes a report of the unique values that the target column would hold
        after the current rules are applied, without modifying the source data.

        Args:
            csv_path (str | Path):
                Path to the CSV file containing the data to clean.
            report_dir (str | Path):
                Directory where the preview report will be saved.
            add_value_separator (bool):
                If True, adds a separator line between each unique value in the report.
            rule_batch_size (int):
                Splits the regex rules into chunks of this size. Helps prevent memory errors.
        """
        # Load only the relevant column, forcing every value to string dtype.
        frame, _ = load_dataframe(df_path=csv_path,
                                  use_columns=[self.column_name],
                                  kind="polars",
                                  all_strings=True)

        # Run this single cleaner through the orchestrator.
        runner = DragonDataFrameCleaner(cleaners=[self])
        cleaned_frame = runner.clean(frame, rule_batch_size=rule_batch_size)

        # Dump the resulting unique values for manual inspection.
        save_unique_values(csv_path_or_df=cleaned_frame,
                           output_dir=report_dir,
                           use_columns=[self.column_name],
                           verbose=False,
                           keep_column_order=False,
                           add_value_separator=add_value_separator)
113
class DragonDataFrameCleaner:
    """
    Orchestrates cleaning multiple columns of a Polars DataFrame.
    """
    def __init__(self, cleaners: list[DragonColumnCleaner]):
        """
        Accepts a list of `DragonColumnCleaner` objects whose rules will later be
        applied to the matching DataFrame columns using high-performance Polars
        expressions with memory optimization.

        Args:
            cleaners (List[DragonColumnCleaner]):
                One configuration object per column; duplicate targets are rejected.

        Raises:
            TypeError: If `cleaners` is not a list of DragonColumnCleaner objects.
            ValueError: If two cleaners target the same column.
        """
        if not isinstance(cleaners, list):
            _LOGGER.error("The 'cleaners' argument must be a list of DragonColumnCleaner objects.")
            raise TypeError()

        registered_columns = set()
        for candidate in cleaners:
            if not isinstance(candidate, DragonColumnCleaner):
                _LOGGER.error(f"All items in 'cleaners' list must be DragonColumnCleaner objects, but found an object of type {type(candidate).__name__}.")
                raise TypeError()
            if candidate.column_name in registered_columns:
                _LOGGER.error(f"Duplicate DragonColumnCleaner found for column '{candidate.column_name}'. Each column should only have one cleaner.")
                raise ValueError()
            registered_columns.add(candidate.column_name)

        self.cleaners = cleaners

    def clean(self, df: Union[pl.DataFrame, pl.LazyFrame],
              rule_batch_size: int = 150) -> pl.DataFrame:
        """
        Applies the configured cleaning rules. Execution is lazy internally,
        which helps handle OOM issues on large data.

        Args:
            df (pl.DataFrame | pl.LazyFrame):
                The data to clean.
            rule_batch_size (int):
                Splits the regex rules into chunks of this size. Helps prevent memory errors.

        Returns:
            pl.DataFrame: The cleaned, collected DataFrame.
        """
        # 1. Normalize the input to a LazyFrame.
        #    Eager frames get an upfront column check; for LazyFrames, missing
        #    columns surface only at collection time.
        if isinstance(df, pl.DataFrame):
            missing = {c.column_name for c in self.cleaners} - set(df.columns)
            if missing:
                _LOGGER.error(f"The following columns specified in cleaners are missing from the DataFrame: {missing}")
                raise ValueError()
            lazy_frame = df.lazy()
        elif isinstance(df, pl.LazyFrame):
            lazy_frame = df
        else:
            _LOGGER.error("The 'df' argument must be a Polars DataFrame or LazyFrame.")
            raise TypeError()

        # 2. Build the expression chain, one rule batch at a time, so no single
        #    expression grows too large.
        for cleaner in self.cleaners:
            target = cleaner.column_name
            rule_items = list(cleaner.rules.items())

            for start in range(0, len(rule_items), rule_batch_size):
                # Each batch starts from the (string-cast) column.
                expr = pl.col(target).cast(pl.String)

                for regex_pattern, substitute in rule_items[start:start + rule_batch_size]:
                    pattern = f"(?i){regex_pattern}" if cleaner.case_insensitive else regex_pattern

                    if substitute is None:
                        # Nullify any value matching the pattern.
                        expr = pl.when(expr.str.contains(pattern)) \
                            .then(None) \
                            .otherwise(expr)
                    else:
                        expr = expr.str.replace_all(pattern, substitute)

                lazy_frame = lazy_frame.with_columns(expr.alias(target))

        # 3. Collect the result with the streaming engine.
        try:
            return lazy_frame.collect(engine="streaming")
        except Exception as e:
            _LOGGER.error("An error occurred during the cleaning process.")
            raise e

    def load_clean_save(self,
                        input_filepath: Union[str, Path],
                        output_filepath: Union[str, Path],
                        rule_batch_size: int = 150):
        """
        Loads a DataFrame from a file, applies every cleaning rule configured in
        this `DragonDataFrameCleaner`, and saves the cleaned result to a new file.

        All data is loaded as string types so that type inference cannot alter
        values before the cleaning operations are applied.

        Args:
            input_filepath (Union[str, Path]):
                The path to the input data file.
            output_filepath (Union[str, Path]):
                The full path, where the cleaned data file will be saved.
            rule_batch_size (int):
                Splits the regex rules into chunks of this size. Helps prevent memory errors.
        """
        raw_df, _ = load_dataframe(df_path=input_filepath, kind="polars", all_strings=True)

        cleaned_df = self.clean(df=raw_df, rule_batch_size=rule_batch_size)

        # Normalize string paths; Path objects are used as-is.
        if isinstance(output_filepath, str):
            output_filepath = make_fullpath(input_path=output_filepath, enforce="file")

        save_dataframe_filename(df=cleaned_df, save_dir=output_filepath.parent, filename=output_filepath.name)

        return None
@@ -0,0 +1,13 @@
1
from .._core import _imprimir_disponibles

# Names exposed by the ETL_cleaning package, in the order they should be
# reported by info().
_GRUPOS = [
    "DragonColumnCleaner",
    "DragonDataFrameCleaner",
    "save_unique_values",
    "basic_clean",
    "basic_clean_drop",
    "drop_macro_polars",
]

def info():
    """Report the public names available in this package.

    Delegates to the shared `_imprimir_disponibles` helper — presumably a
    console printer; confirm in `_core`.
    """
    _imprimir_disponibles(_GRUPOS)
@@ -1,6 +1,9 @@
1
- from ._core._ETL_engineering import (
2
- DragonTransformRecipe,
1
+ from ._dragon_engineering import (
3
2
  DragonProcessor,
3
+ DragonTransformRecipe,
4
+ )
5
+
6
+ from ._transforms import (
4
7
  BinaryTransformer,
5
8
  MultiBinaryDummifier,
6
9
  AutoDummifier,
@@ -15,10 +18,11 @@ from ._core._ETL_engineering import (
15
18
  RegexMapper,
16
19
  ValueBinner,
17
20
  DateFeatureExtractor,
18
- MolecularFormulaTransformer,
19
- info
21
+ MolecularFormulaTransformer
20
22
  )
21
23
 
24
+ from ._imprimir import info
25
+
22
26
 
23
27
  __all__ = [
24
28
  "DragonTransformRecipe",
@@ -0,0 +1,261 @@
1
+ import polars as pl
2
+ from pathlib import Path
3
+ from typing import Union, Optional, Any, Callable
4
+
5
+ from ..utilities import load_dataframe, save_dataframe_filename
6
+
7
+ from ..keys._keys import MagicWords
8
+ from ..path_manager import make_fullpath
9
+ from .._core import get_logger
10
+
11
+
12
+ _LOGGER = get_logger("DragonTransform")
13
+
14
+
15
+ __all__ = [
16
+ "DragonTransformRecipe",
17
+ "DragonProcessor",
18
+ ]
19
+
20
+
21
class DragonTransformRecipe:
    """
    Builder for a data transformation recipe.

    Each call to `add()` validates and records one transformation step; the
    populated recipe is then handed to a `DragonProcessor`.
    """
    def __init__(self):
        # Ordered steps, each a dict with keys "input_col", "output_col", "transform".
        self._steps: list[dict[str, Any]] = []

    def add(
        self,
        input_col_name: str,
        transform: Union[str, Callable],
        output_col_names: Optional[Union[str, list[str]]] = None
    ) -> "DragonTransformRecipe":
        """
        Adds a new transformation step to the recipe.

        Args:
            input_col_name: The name of the column from the source DataFrame.
            transform: The transformation to apply:
                - Use "rename" for simple column renaming.
                - If callable, must accept a `pl.Series` as the only parameter
                  and return either a `pl.Series` or `pl.DataFrame`.
            output_col_names: The desired name(s) for the output column(s).
                - A string for a 1-to-1 mapping.
                - A list of strings for a 1-to-many mapping.
                - A string prefix for a 1-to-many mapping.
                - If None, the input name is used for 1-to-1 transforms,
                  or the transformer's default names are used for 1-to-many.

        Returns:
            This recipe instance, enabling method chaining.

        Raises:
            TypeError: On invalid argument types.
        """
        # --- Validation ---
        if not (isinstance(input_col_name, str) and input_col_name):
            _LOGGER.error("'input_col' must be a non-empty string.")
            raise TypeError()

        if transform == MagicWords.RENAME:
            if not isinstance(output_col_names, str):
                _LOGGER.error("For a RENAME operation, 'output_col' must be a string.")
                raise TypeError()
        elif not isinstance(transform, Callable):
            _LOGGER.error(f"'transform' must be a callable function or the string '{MagicWords.RENAME}'.")
            raise TypeError()

        # --- Record the step ---
        self._steps.append({
            "input_col": input_col_name,
            "output_col": output_col_names,
            "transform": transform,
        })
        return self  # supports recipe.add(...).add(...)

    def __iter__(self):
        """Iterate over the recorded steps, like a list."""
        return iter(self._steps)

    def __len__(self):
        """Number of recorded steps."""
        return len(self._steps)
class DragonProcessor:
    """
    Transforms a Polars DataFrame based on a provided `DragonTransformRecipe` object.

    Use the methods `transform()` or `load_transform_save()`.
    """
    def __init__(self, recipe: DragonTransformRecipe):
        """
        Initializes the DragonProcessor with a transformation recipe.

        Args:
            recipe: An instance of the `DragonTransformRecipe` class that has
                been populated with transformation steps.

        Raises:
            TypeError: If `recipe` is not a DragonTransformRecipe.
            ValueError: If the recipe contains no steps.
        """
        if not isinstance(recipe, DragonTransformRecipe):
            _LOGGER.error("The recipe must be an instance of DragonTransformRecipe.")
            raise TypeError()
        if len(recipe) == 0:
            _LOGGER.error("The recipe cannot be empty.")
            raise ValueError()
        self._recipe = recipe

    def transform(self, df: pl.DataFrame) -> pl.DataFrame:
        """
        Applies the transformation recipe to the input DataFrame.

        Each step either renames its input column or calls the step's function
        on the input Series; Series results map to a single output column,
        DataFrame results map to several (renamed per the step's output spec).

        Raises:
            ValueError: If an input column is missing, or a list output spec does
                not match the number of produced columns.
            TypeError: If an output spec or a transform result has the wrong type.
        """
        processed_columns = []
        # Recipe object is iterable
        for step in self._recipe:
            input_col_name = step["input_col"]
            output_col_spec = step["output_col"]
            transform_action = step["transform"]

            if input_col_name not in df.columns:
                _LOGGER.error(f"Input column '{input_col_name}' not found in DataFrame.")
                raise ValueError()

            input_series = df.get_column(input_col_name)

            # Simple rename: no function call involved.
            if transform_action == MagicWords.RENAME:
                processed_columns.append(input_series.alias(output_col_spec))
                continue

            if isinstance(transform_action, Callable):
                result = transform_action(input_series)

                if isinstance(result, pl.Series):
                    # 1-to-1 mapping: default to the input name if spec is None.
                    output_name = output_col_spec if output_col_spec is not None else input_col_name

                    if not isinstance(output_name, str):
                        _LOGGER.error(f"Function for '{input_col_name}' returned a Series but 'output_col' must be a string or None.")
                        raise TypeError()
                    processed_columns.append(result.alias(output_name))

                elif isinstance(result, pl.DataFrame):
                    # 1-to-many mapping; resolution depends on the spec's type.
                    if output_col_spec is None:
                        # Use the column names generated by the transformer directly.
                        processed_columns.extend(result.get_columns())

                    elif isinstance(output_col_spec, list):
                        # Explicit list of names: must match produced column count.
                        if len(result.columns) != len(output_col_spec):
                            _LOGGER.error(f"Mismatch in '{input_col_name}': function produced {len(result.columns)} columns, but recipe specifies {len(output_col_spec)} output names.")
                            raise ValueError()

                        renamed_df = result.rename(dict(zip(result.columns, output_col_spec)))
                        processed_columns.extend(renamed_df.get_columns())

                    elif isinstance(output_col_spec, str):
                        # Single string acts as a prefix for every produced column.
                        prefix = output_col_spec
                        new_names = {}

                        for col in result.columns:
                            # Case 1: the produced name starts with the input name.
                            # Replace that leading input name with the prefix.
                            # Example: input='color', output='color_red', prefix='spec' -> 'spec_red'
                            if col.startswith(input_col_name):
                                new_names[col] = col.replace(input_col_name, prefix, 1)

                            # Case 2: the produced name is independent of the input.
                            # Prepend the prefix.
                            # Example: input='ratio', output='A_B', prefix='spec' -> 'spec_A_B'
                            else:
                                new_names[col] = f"{prefix}_{col}"

                        renamed_df = result.rename(new_names)
                        processed_columns.extend(renamed_df.get_columns())

                    else:
                        _LOGGER.error(f"Function for '{input_col_name}' returned a DataFrame, so 'output_col' must be a list of names, a string prefix, or None.")
                        raise TypeError()

                else:
                    _LOGGER.error(f"Function for '{input_col_name}' returned an unexpected type: {type(result)}.")
                    raise TypeError()

            else:  # This case is unlikely due to builder validation.
                _LOGGER.error(f"Invalid 'transform' action for '{input_col_name}': {transform_action}")
                raise TypeError()

        if not processed_columns:
            _LOGGER.error("The transformation resulted in an empty DataFrame.")
            return pl.DataFrame()

        _LOGGER.info(f"Processed dataframe with {len(processed_columns)} columns.")

        return pl.DataFrame(processed_columns)

    def load_transform_save(self, input_path: Union[str, Path], output_path: Union[str, Path]):
        """
        Convenience wrapper for the transform method that includes automatic dataframe loading and saving.
        """
        # Validate paths
        in_path = make_fullpath(input_path, enforce="file")
        out_path = make_fullpath(output_path, make=True, enforce="file")

        # Load everything as strings to avoid premature type inference.
        df, _ = load_dataframe(df_path=in_path, kind="polars", all_strings=True)

        # Process
        df_processed = self.transform(df)

        # Save processed df
        save_dataframe_filename(df=df_processed, save_dir=out_path.parent, filename=out_path.name)

    def __str__(self) -> str:
        """
        Provides a detailed, human-readable string representation of the
        entire processing pipeline.
        """
        header = "DragonProcessor Pipeline"
        divider = "-" * len(header)
        num_steps = len(self._recipe)

        lines = [
            header,
            divider,
            f"Number of steps: {num_steps}\n"
        ]

        if num_steps == 0:
            lines.append("No transformation steps defined.")
            return "\n".join(lines)

        for i, step in enumerate(self._recipe, 1):
            transform_action = step["transform"]

            # Get a clean name for the transformation action
            if transform_action == MagicWords.RENAME:  # "rename"
                transform_name = "Rename"
            else:
                # FIX: `type(x).__name__` labeled every plain function as
                # "function". Prefer the callable's own __name__ (functions,
                # lambdas) and fall back to the type name for callable class
                # instances, which normally carry no __name__ of their own.
                transform_name = getattr(transform_action, "__name__", type(transform_action).__name__)

            lines.append(f"[{i}] Input: '{step['input_col']}'")
            lines.append(f"  - Transform: {transform_name}")
            lines.append(f"  - Output(s): {step['output_col']}")
            if i < num_steps:
                lines.append("")  # Add a blank line between steps

        return "\n".join(lines)

    def inspect(self) -> None:
        """
        Prints the detailed string representation of the pipeline to the console.
        """
        print(self)
@@ -0,0 +1,24 @@
1
from .._core import _imprimir_disponibles

# Names exposed by the ETL_engineering package, in the order they should be
# reported by info().
_GRUPOS = [
    "DragonTransformRecipe",
    "DragonProcessor",
    "BinaryTransformer",
    "MultiBinaryDummifier",
    "AutoDummifier",
    "KeywordDummifier",
    "NumberExtractor",
    "MultiNumberExtractor",
    "TemperatureExtractor",
    "MultiTemperatureExtractor",
    "RatioCalculator",
    "TriRatioCalculator",
    "CategoryMapper",
    "RegexMapper",
    "ValueBinner",
    "DateFeatureExtractor",
    "MolecularFormulaTransformer"
]

def info():
    """Report the public names available in this package.

    Delegates to the shared `_imprimir_disponibles` helper — presumably a
    console printer; confirm in `_core`.
    """
    _imprimir_disponibles(_GRUPOS)