dragon-ml-toolbox 10.13.0__tar.gz → 10.15.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of dragon-ml-toolbox has been flagged as potentially problematic.
Files changed (41):
  1. {dragon_ml_toolbox-10.13.0/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-10.15.0}/PKG-INFO +1 -1
  2. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0/dragon_ml_toolbox.egg-info}/PKG-INFO +1 -1
  3. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/ml_tools/ETL_cleaning.py +28 -16
  4. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/ml_tools/ML_optimization.py +103 -3
  5. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/pyproject.toml +1 -1
  6. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/LICENSE +0 -0
  7. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/LICENSE-THIRD-PARTY.md +0 -0
  8. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/README.md +0 -0
  9. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/dragon_ml_toolbox.egg-info/SOURCES.txt +0 -0
  10. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/dragon_ml_toolbox.egg-info/dependency_links.txt +0 -0
  11. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/dragon_ml_toolbox.egg-info/requires.txt +0 -0
  12. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/dragon_ml_toolbox.egg-info/top_level.txt +0 -0
  13. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/ml_tools/ETL_engineering.py +0 -0
  14. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/ml_tools/GUI_tools.py +0 -0
  15. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/ml_tools/MICE_imputation.py +0 -0
  16. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/ml_tools/ML_callbacks.py +0 -0
  17. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/ml_tools/ML_datasetmaster.py +0 -0
  18. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/ml_tools/ML_evaluation.py +0 -0
  19. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/ml_tools/ML_evaluation_multi.py +0 -0
  20. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/ml_tools/ML_inference.py +0 -0
  21. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/ml_tools/ML_models.py +0 -0
  22. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/ml_tools/ML_scaler.py +0 -0
  23. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/ml_tools/ML_trainer.py +0 -0
  24. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/ml_tools/PSO_optimization.py +0 -0
  25. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/ml_tools/RNN_forecast.py +0 -0
  26. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/ml_tools/SQL.py +0 -0
  27. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/ml_tools/VIF_factor.py +0 -0
  28. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/ml_tools/__init__.py +0 -0
  29. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/ml_tools/_logger.py +0 -0
  30. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/ml_tools/_script_info.py +0 -0
  31. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/ml_tools/custom_logger.py +0 -0
  32. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/ml_tools/data_exploration.py +0 -0
  33. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/ml_tools/ensemble_evaluation.py +0 -0
  34. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/ml_tools/ensemble_inference.py +0 -0
  35. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/ml_tools/ensemble_learning.py +0 -0
  36. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/ml_tools/handle_excel.py +0 -0
  37. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/ml_tools/keys.py +0 -0
  38. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/ml_tools/optimization_tools.py +0 -0
  39. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/ml_tools/path_manager.py +0 -0
  40. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/ml_tools/utilities.py +0 -0
  41. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/setup.cfg +0 -0
PKG-INFO (the same one-line version bump applies to both the top-level copy and dragon_ml_toolbox.egg-info/PKG-INFO):

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 10.13.0
+Version: 10.15.0
 Summary: A collection of tools for data science and machine learning projects.
 Author-email: Karl Loza <luigiloza@gmail.com>
 License-Expression: MIT
ml_tools/ETL_cleaning.py:

@@ -96,7 +96,7 @@ def save_unique_values(csv_path: Union[str, Path], output_dir: Union[str, Path],
 
 
 ########## Basic df cleaners #############
-def _cleaner_core(df_in: pl.DataFrame) -> pl.DataFrame:
+def _cleaner_core(df_in: pl.DataFrame, all_lowercase: bool) -> pl.DataFrame:
     # Cleaning rules
     cleaning_rules = {
         # 1. Comprehensive Punctuation & Symbol Normalization
@@ -128,7 +128,7 @@ def _cleaner_core(df_in: pl.DataFrame) -> pl.DataFrame:
         # Punctuation
         '》': '>', '《': '<', '：': ':', '。': '.', '；': ';', '【': '[', '】': ']',
         '（': '(', '）': ')', '？': '?', '！': '!', '～': '~', '＠': '@', '＃': '#', '＋': '+', '－': '-',
-        '＄': '$', '％': '%', '＾': '^', '＆': '&', '＊': '*', '＼': '-', '｜': '|', '≈':'=',
+        '＄': '$', '％': '%', '＾': '^', '＆': '&', '＊': '*', '＼': '-', '｜': '|', '≈':'=', '·': '-',
 
         # Commas (avoid commas in entries)
         '，': ';',
@@ -159,6 +159,9 @@ def _cleaner_core(df_in: pl.DataFrame) -> pl.DataFrame:
         r'!{2,}': '!', # Replace two or more exclamation marks with a single one
         r';{2,}': ';',
         r'-{2,}': '-',
+        r'/{2,}': '/',
+        r'%{2,}': '%',
+        r'&{2,}': '&',
 
         # 2. Internal Whitespace Consolidation
         # Collapse any sequence of whitespace chars (including non-breaking spaces) to a single space
@@ -170,7 +173,7 @@ def _cleaner_core(df_in: pl.DataFrame) -> pl.DataFrame:
 
         # 4. Textual Null Standardization (New Step)
         # Convert common null-like text to actual nulls.
-        r'^(N/A|无|NA|NULL|NONE|NIL|-|\.|;)$': None,
+        r'^(N/A|无|NA|NULL|NONE|NIL|-|\.|;|/|%|&)$': None,
 
         # 5. Final Nullification of Empty Strings
         # After all cleaning, if a string is now empty, convert it to a null
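Taken together, the new rules first collapse runs of `/`, `%`, and `&` into a single character, and the widened null pattern then converts a cell consisting of only one such character into a true null. A minimal, self-contained Polars sketch of that behavior (this does not use the package's own cleaner class, and the column name is hypothetical):

```python
import polars as pl

df = pl.DataFrame({"col": ["a//b", "50%%", "x&&&y", "/", "%", "&", "ok"]})

cleaned = df.with_columns(
    pl.col("col")
    .str.replace_all(r"/{2,}", "/")     # collapse repeated slashes
    .str.replace_all(r"%{2,}", "%")     # collapse repeated percent signs
    .str.replace_all(r"&{2,}", "&")     # collapse repeated ampersands
    .str.replace_all(r"^(/|%|&)$", "")  # a lone /, % or & becomes an empty string
).with_columns(
    # empty strings become true nulls, mirroring the final nullification step
    pl.when(pl.col("col") == "").then(None).otherwise(pl.col("col")).alias("col")
)
print(cleaned["col"].to_list())  # ['a/b', '50%', 'x&y', None, None, None, 'ok']
```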
@@ -191,9 +194,13 @@ def _cleaner_core(df_in: pl.DataFrame) -> pl.DataFrame:
         df_cleaned = df_cleaner.clean(df_in, clone_df=False) # Use clone_df=False for efficiency
 
         # apply lowercase to all string columns
-        df_final = df_cleaned.with_columns(
-            pl.col(pl.String).str.to_lowercase()
-        )
+        if all_lowercase:
+            df_final = df_cleaned.with_columns(
+                pl.col(pl.String).str.to_lowercase()
+            )
+        else:
+            df_final = df_cleaned
+
     except Exception as e:
         _LOGGER.error(f"An error occurred during the cleaning process.")
         raise e
@@ -211,7 +218,7 @@ def _path_manager(path_in: Union[str,Path], path_out: Union[str,Path]):
     return input_path, output_path
 
 
-def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path]):
+def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path], all_lowercase: bool=True):
     """
     Performs a comprehensive, standardized cleaning on all columns of a CSV file.
 
@@ -221,13 +228,16 @@ def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path
     - Stripping any leading or trailing whitespace.
     - Converting common textual representations of null (e.g., "N/A", "NULL") to true null values.
     - Converting strings that become empty after cleaning into true null values.
-    - Normalizing all text to lowercase.
+    - Normalizing all text to lowercase (Optional).
 
     Args:
-        input_filepath (Union[str, Path]):
+        input_filepath (str | Path):
             The path to the source CSV file to be cleaned.
-        output_filepath (Union[str, Path, None], optional):
+        output_filepath (str | Path):
             The path to save the cleaned CSV file.
+        all_lowercase (bool):
+            Whether to normalize all text to lowercase.
+
     """
     # Handle paths
     input_path, output_path = _path_manager(path_in=input_filepath, path_out=output_filepath)
@@ -236,7 +246,7 @@ def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path
     df, _ = load_dataframe(df_path=input_path, kind="polars", all_strings=True)
 
     # CLEAN
-    df_final = _cleaner_core(df)
+    df_final = _cleaner_core(df_in=df, all_lowercase=all_lowercase)
 
     # Save cleaned dataframe
     save_dataframe(df=df_final, save_dir=output_path.parent, filename=output_path.name)
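A short usage sketch of the updated signature (file paths are hypothetical). The flag defaults to `True`, so existing callers keep the old lowercasing behavior:

```python
from ml_tools.ETL_cleaning import basic_clean

# Default: identical to the previous behavior (all text is lowercased).
basic_clean("data/raw.csv", "data/clean.csv")

# New in this release: keep the original casing, e.g. for IDs or chemical formulas.
basic_clean("data/raw.csv", "data/clean_cased.csv", all_lowercase=False)
```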
@@ -245,7 +255,7 @@ def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path
 
 
 def basic_clean_drop(input_filepath: Union[str,Path], output_filepath: Union[str,Path], log_directory: Union[str,Path], targets: list[str],
-                     skip_targets: bool=False, threshold: float=0.8):
+                     skip_targets: bool=False, threshold: float=0.8, all_lowercase: bool=True):
     """
     Performs standardized cleaning followed by iterative removal of rows and
     columns with excessive missing data.
@@ -262,12 +272,12 @@ def basic_clean_drop(input_filepath: Union[str,Path], output_filepath: Union[str
     dropping process are saved to the specified log directory.
 
     Args:
-        input_filepath (str, Path):
+        input_filepath (str | Path):
             The path to the source CSV file to be cleaned.
-        output_filepath (str, Path):
+        output_filepath (str | Path):
             The path to save the fully cleaned CSV file after cleaning
             and missing-data-based pruning.
-        log_directory (str, Path):
+        log_directory (str | Path):
             Path to the directory where missing data reports will be stored.
         targets (list[str]):
             A list of column names to be treated as target variables.
@@ -279,6 +289,8 @@ def basic_clean_drop(input_filepath: Union[str,Path], output_filepath: Union[str
             The proportion of missing data required to drop a row or column.
             For example, 0.8 means a row/column will be dropped if 80% or more
             of its data is missing.
+        all_lowercase (bool):
+            Whether to normalize all text to lowercase.
     """
     # handle log path
     log_path = make_fullpath(log_directory, make=True, enforce="directory")
@@ -290,7 +302,7 @@ def basic_clean_drop(input_filepath: Union[str,Path], output_filepath: Union[str
     df, _ = load_dataframe(df_path=input_path, kind="polars", all_strings=True)
 
     # CLEAN
-    df_cleaned = _cleaner_core(df)
+    df_cleaned = _cleaner_core(df_in=df, all_lowercase=all_lowercase)
 
     # switch to pandas
     df_cleaned_pandas = df_cleaned.to_pandas()
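`basic_clean_drop` gains the same flag, forwarded to `_cleaner_core` before the pruning step. A hedged usage sketch (paths and target names are hypothetical):

```python
from ml_tools.ETL_cleaning import basic_clean_drop

# Clean, then iteratively drop rows/columns that are >= 80% missing;
# missing-data reports are written to the log directory.
basic_clean_drop(
    input_filepath="data/raw.csv",
    output_filepath="data/clean_pruned.csv",
    log_directory="logs/missing_data",
    targets=["yield", "purity"],
    threshold=0.8,
    all_lowercase=False,  # new flag in this release
)
```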
ml_tools/ML_optimization.py:

@@ -20,12 +20,112 @@ from .SQL import DatabaseManager
 from .optimization_tools import _save_result
 from .utilities import threshold_binary_values, save_dataframe
 
+
 __all__ = [
+    "MLOptimizer",
     "create_pytorch_problem",
     "run_optimization"
 ]
 
 
+class MLOptimizer:
+    """
+    A wrapper class for setting up and running EvoTorch optimization tasks.
+
+    This class combines the functionality of the `create_pytorch_problem` and
+    `run_optimization` functions into a single, streamlined workflow.
+
+    The SNES and CEM algorithms do not accept bounds; the given bounds will be used as an initial starting point.
+
+    Example:
+        >>> # 1. Initialize the optimizer with model and search parameters
+        >>> optimizer = MLOptimizer(
+        ...     inference_handler=my_handler,
+        ...     bounds=(lower_bounds, upper_bounds),
+        ...     number_binary_features=2,
+        ...     task="max",
+        ...     algorithm="Genetic"
+        ... )
+        >>> # 2. Run the optimization and save the results
+        >>> best_result = optimizer.run(
+        ...     num_generations=100,
+        ...     target_name="my_target",
+        ...     feature_names=my_feature_names,
+        ...     save_dir="/path/to/results",
+        ...     save_format="csv"
+        ... )
+    """
+    def __init__(self,
+                 inference_handler: PyTorchInferenceHandler,
+                 bounds: Tuple[List[float], List[float]],
+                 number_binary_features: int,
+                 task: Literal["min", "max"],
+                 algorithm: Literal["SNES", "CEM", "Genetic"] = "Genetic",
+                 population_size: int = 200,
+                 **searcher_kwargs):
+        """
+        Initializes the optimizer by creating the EvoTorch problem and searcher.
+
+        Args:
+            inference_handler (PyTorchInferenceHandler): An initialized inference handler containing the model and weights.
+            bounds (tuple[list[float], list[float]]): A tuple containing the lower and upper bounds for the solution features.
+            number_binary_features (int): Number of binary features located at the END of the feature vector.
+            task (str): The optimization goal, either "min" or "max".
+            algorithm (str): The search algorithm to use ("SNES", "CEM", "Genetic").
+            population_size (int): Population size for CEM and GeneticAlgorithm.
+            **searcher_kwargs: Additional keyword arguments for the selected search algorithm's constructor.
+        """
+        # Call the existing factory function to get the problem and searcher factory
+        self.problem, self.searcher_factory = create_pytorch_problem(
+            inference_handler=inference_handler,
+            bounds=bounds,
+            binary_features=number_binary_features,
+            task=task,
+            algorithm=algorithm,
+            population_size=population_size,
+            **searcher_kwargs
+        )
+        # Store binary_features count to pass it to the run function later
+        self._binary_features = number_binary_features
+
+    def run(self,
+            num_generations: int,
+            target_name: str,
+            save_dir: Union[str, Path],
+            feature_names: Optional[List[str]],
+            save_format: Literal['csv', 'sqlite', 'both'],
+            repetitions: int = 1,
+            verbose: bool = True) -> Optional[dict]:
+        """
+        Runs the evolutionary optimization process using the pre-configured settings.
+
+        Args:
+            num_generations (int): The total number of generations for each repetition.
+            target_name (str): Target name used for the CSV filename and/or SQL table.
+            save_dir (str | Path): The directory where result files will be saved.
+            feature_names (List[str] | None): Names of the solution features for labeling output. If None, generic names like 'feature_0', 'feature_1', ... will be created.
+            save_format (Literal['csv', 'sqlite', 'both']): The format for saving results.
+            repetitions (int): The number of independent times to run the optimization.
+            verbose (bool): If True, enables detailed logging.
+
+        Returns:
+            Optional[dict]: A dictionary with the best result if repetitions is 1, otherwise None.
+        """
+        # Call the existing run function with the stored problem, searcher, and binary feature count
+        return run_optimization(
+            problem=self.problem,
+            searcher_factory=self.searcher_factory,
+            num_generations=num_generations,
+            target_name=target_name,
+            binary_features=self._binary_features,
+            save_dir=save_dir,
+            save_format=save_format,
+            feature_names=feature_names,
+            repetitions=repetitions,
+            verbose=verbose
+        )
+
+
 def create_pytorch_problem(
     inference_handler: PyTorchInferenceHandler,
     bounds: Tuple[List[float], List[float]],
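For reference, `MLOptimizer` is a thin wrapper over the two existing functions, so the equivalent two-step workflow remains available. A sketch mirroring the wrapper's own internal calls (`my_handler`, `lower_bounds`, `upper_bounds`, and `my_feature_names` are assumed to exist, as in the class docstring example):

```python
from ml_tools.ML_optimization import create_pytorch_problem, run_optimization

# Step 1: build the EvoTorch problem and the searcher factory.
problem, searcher_factory = create_pytorch_problem(
    inference_handler=my_handler,
    bounds=(lower_bounds, upper_bounds),
    binary_features=2,
    task="max",
    algorithm="Genetic",
)

# Step 2: run the optimization and save the results.
best_result = run_optimization(
    problem=problem,
    searcher_factory=searcher_factory,
    num_generations=100,
    target_name="my_target",
    binary_features=2,
    save_dir="/path/to/results",
    save_format="csv",
    feature_names=my_feature_names,
    repetitions=1,
    verbose=True,
)
```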
@@ -38,7 +138,7 @@ def create_pytorch_problem(
     """
     Creates and configures an EvoTorch Problem and a Searcher factory class for a PyTorch model.
 
-    SNES and CEM do not accept bounds, the given bounds will be used as initial bounds only.
+    SNES and CEM do not accept bounds; the given bounds will be used as an initial starting point.
 
     The Genetic Algorithm works directly with the bounds, and operators such as SimulatedBinaryCrossOver and GaussianMutation.
 
@@ -62,8 +162,8 @@ def create_pytorch_problem(
 
     # add binary bounds
     if binary_features > 0:
-        lower_bounds.extend([0.45] * binary_features)
-        upper_bounds.extend([0.55] * binary_features)
+        lower_bounds.extend([0.48] * binary_features)
+        upper_bounds.extend([0.52] * binary_features)
 
     solution_length = len(lower_bounds)
     device = inference_handler.device
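The initial bounds for binary slots narrow from [0.45, 0.55] to [0.48, 0.52], so initial candidates start closer to the 0.5 decision boundary and neither 0 nor 1 is favored at initialization. Downstream, the continuous values are snapped to 0/1 (the module imports `threshold_binary_values` for this; its exact signature is not shown in this diff). A hypothetical NumPy sketch of the idea, assuming a 0.5 cutoff on the trailing binary slots:

```python
import numpy as np

def threshold_trailing_binaries(solution: np.ndarray, n_binary: int) -> np.ndarray:
    """Round the last n_binary entries of a solution vector to 0/1.

    Illustrative helper only; the package's threshold_binary_values
    may differ in name, signature, and behavior.
    """
    out = solution.astype(float).copy()
    if n_binary > 0:
        out[-n_binary:] = (out[-n_binary:] >= 0.5).astype(float)
    return out

print(threshold_trailing_binaries(np.array([3.2, 0.49, 0.51]), n_binary=2))
# -> [3.2 0.  1. ]
```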
pyproject.toml:

@@ -1,6 +1,6 @@
 [project]
 name = "dragon-ml-toolbox"
-version = "10.13.0"
+version = "10.15.0"
 description = "A collection of tools for data science and machine learning projects."
 authors = [
     { name = "Karl Loza", email = "luigiloza@gmail.com" }