PyPI - dragon-ml-toolbox - Versions diffs - 3.0.0__py3-none-any.whl → 3.1.0__py3-none-any.whl - Mend

dragon-ml-toolbox 3.0.0__py3-none-any.whl → 3.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of dragon-ml-toolbox might be problematic. Click here for more details.

Files changed (8) hide show

{dragon_ml_toolbox-3.0.0.dist-info → dragon_ml_toolbox-3.1.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 3.0.0
+Version: 3.1.0
 Summary: A collection of tools for data science and machine learning projects.
 Author-email: Karl Loza <luigiloza@gmail.com>
 License-Expression: MIT

{dragon_ml_toolbox-3.0.0.dist-info → dragon_ml_toolbox-3.1.0.dist-info}/RECORD RENAMED Viewed

@@ -1,6 +1,6 @@
-dragon_ml_toolbox-3.0.0.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
-dragon_ml_toolbox-3.0.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=6cfpIeQ6D4Mcs10nkogQrkVyq1T7i2qXjjNHFoUMOyE,1892
-ml_tools/ETL_engineering.py,sha256=SRiloWhSpopS4ay8mzUu0H4e9-37Ox_jDHzODqsQ8pc,31642
+dragon_ml_toolbox-3.1.0.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
+dragon_ml_toolbox-3.1.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=6cfpIeQ6D4Mcs10nkogQrkVyq1T7i2qXjjNHFoUMOyE,1892
+ml_tools/ETL_engineering.py,sha256=qxLbw8Vc0lOHUJm5ou280Tvw3oh_G1UHonxfa7nu_4Q,33008
 ml_tools/GUI_tools.py,sha256=uFx6zIrQZzDPSTtOSHz8ptz-fxZiQz-lXHcrqwuYV_E,20385
 ml_tools/MICE_imputation.py,sha256=ed-YeQkEAeHxTNkWIHs09T4YeYNF0aqAnrUTcdIEp9E,11372
 ml_tools/ML_callbacks.py,sha256=gHZk-lyzAax6iEtG26zHuoobdAZCFJ6BmI6pWoXkOrw,13189
@@ -13,13 +13,13 @@ ml_tools/VIF_factor.py,sha256=5GVAldH69Vkei3WRUZN1uPBMzGoOOeEOA-bgmZXbbUw,10301
 ml_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ml_tools/_particle_swarm_optimization.py,sha256=b_eNNkA89Y40hj76KauivT8KLScH1B9wF2IXptOqkOw,22220
 ml_tools/_pytorch_models.py,sha256=bpWZsrSwCvHJQkR6UfoPpElsMv9AvmiNErNHC8NYB_I,10132
-ml_tools/data_exploration.py,sha256=Fzbz_DKZ7F2e3-JbahLqKr3aP6lt9aCK9rNOHvR7nlA,23665
+ml_tools/data_exploration.py,sha256=bOcCoQLeDPFJ7nB5Fsi16lzB22cG-c-mxObMsTetgS4,23655
 ml_tools/datasetmaster.py,sha256=N-uwfzWnl_qnoAqjbfS98I1pVNra5u6rhKLdWbFIReA,30122
 ml_tools/ensemble_learning.py,sha256=PPtBBLgLvaYOdY-MlcjXuxWWXf3JQavLNEysFgzjc_s,37470
 ml_tools/handle_excel.py,sha256=lwds7rDLlGSCWiWGI7xNg-Z7kxAepogp0lstSFa0590,12949
 ml_tools/logger.py,sha256=jC4Q2OqmDm8ZO9VpuZqBSWdXryqaJvLscqVJ6caNMOk,6009
 ml_tools/utilities.py,sha256=opNR-ACH6BnLkWAKcb19ef5tFxfx22TI6E2o0RYwiGA,21021
-dragon_ml_toolbox-3.0.0.dist-info/METADATA,sha256=nmhUu0bwN4z1letePaDzGIQlmDUaBQ32esqGB-OasU4,3273
-dragon_ml_toolbox-3.0.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-dragon_ml_toolbox-3.0.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
-dragon_ml_toolbox-3.0.0.dist-info/RECORD,,
+dragon_ml_toolbox-3.1.0.dist-info/METADATA,sha256=yGk-slwRhPF23NxfVG0vR0NeIQbo_mJ-_ZEIomBLvrQ,3273
+dragon_ml_toolbox-3.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+dragon_ml_toolbox-3.1.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
+dragon_ml_toolbox-3.1.0.dist-info/RECORD,,

ml_tools/ETL_engineering.py CHANGED Viewed

@@ -25,18 +25,26 @@ __all__ = [
 class ColumnCleaner:
     """
-    Cleans and standardizes a single pandas Series based on a dictionary of regex-to-value replacement rules.
+    Cleans and standardizes a pandas Series by applying regex-to-replacement rules.
+    Supports sub-string replacements and case-insensitivity.
+    Notes:
+    - Write separate, specific rules for each case. Don't combine patterns with an "OR".
+    - Define rules from most specific to more general to create a fallback system.
+    - Beware of chain replacements (rules matching strings that have already been changed by a previous rule).
     Args:
         rules (Dict[str, str]):
-            A dictionary where each key is a regular expression pattern and
-            each value is the standardized string to replace matches with.
+            A dictionary of regex patterns to replacement strings. Can use
+            backreferences in the replacement statement (e.g., r'\\1 \\2 \\3 \\4 \\5') for captured groups.
+        case_insensitive (bool):
+            If True, regex matching ignores case.
     """
-    def __init__(self, rules: Dict[str, str]):
+    def __init__(self, rules: Dict[str, str], case_insensitive: bool = True):
         if not isinstance(rules, dict):
             raise TypeError("The 'rules' argument must be a dictionary.")
-        # Validate that all keys are valid regular expressions
+        # Validate regex patterns
         for pattern in rules.keys():
             try:
                 re.compile(pattern)
@@ -44,32 +52,52 @@ class ColumnCleaner:
                 raise ValueError(f"Invalid regex pattern '{pattern}': {e}") from e
         self.rules = rules
+        self.case_insensitive = case_insensitive
     def clean(self, series: pd.Series) -> pd.Series:
         """
-        Applies the standardization rules to the provided Series (requires string data).
+        Applies the standardization rules sequentially to the provided Series.
-        Non-matching values are kept as they are.
         Args:
             series (pd.Series): The pandas Series to clean.
         Returns:
-            pd.Series: A new Series with the values cleaned and standardized.
+            pd.Series: A new Series with the regex replacements applied.
         """
-        return series.astype(str).replace(self.rules, regex=True)
+        cleaned_series = series.astype(str)
+        # Set the regex flags based on the case_insensitive setting
+        flags = re.IGNORECASE if self.case_insensitive else 0
+        # Sequentially apply each regex rule
+        for pattern, replacement in self.rules.items():
+            cleaned_series = cleaned_series.str.replace(
+                pattern,
+                replacement,
+                regex=True,
+                flags=flags
+            )
+        return cleaned_series
 class DataFrameCleaner:
     """
     Orchestrates the cleaning of multiple columns in a pandas DataFrame using a nested dictionary of rules and `ColumnCleaner` objects.
+    Chosen case-sensitivity is applied to all columns.
+    Notes:
+    - Write separate, specific rules for each case. Don't combine patterns with an "OR".
+    - Define rules from most specific to more general to create a fallback system.
+    - Beware of chain replacements (rules matching strings that have already been changed by a previous rule).
     Args:
         rules (Dict[str, Dict[str, str]]):
             A nested dictionary where each top-level key is a column name,
             and its value is a dictionary of regex rules for that column, as expected by `ColumnCleaner`.
     """
-    def __init__(self, rules: Dict[str, Dict[str, str]]):
+    def __init__(self, rules: Dict[str, Dict[str, str]], case_insensitive: bool = True):
         if not isinstance(rules, dict):
             raise TypeError("The 'rules' argument must be a nested dictionary.")
@@ -81,6 +109,7 @@ class DataFrameCleaner:
                 )
         self.rules = rules
+        self.case_insensitive = case_insensitive
     def clean(self, df: pd.DataFrame) -> pd.DataFrame:
         """
@@ -109,7 +138,7 @@ class DataFrameCleaner:
         for column_name, column_rules in self.rules.items():
             # Create and apply the specific cleaner for the column
-            cleaner = ColumnCleaner(rules=column_rules)
+            cleaner = ColumnCleaner(rules=column_rules, case_insensitive=self.case_insensitive)
             df_cleaned[column_name] = cleaner.clean(df_cleaned[column_name])
         return df_cleaned

ml_tools/data_exploration.py CHANGED Viewed

@@ -587,14 +587,14 @@ def standardize_percentages(
     Standardizes numeric columns containing mixed-format percentages.
     This function cleans columns where percentages might be entered as whole
-    numbers (e.g., 55) or as proportions (e.g., 0.55). It assumes values
+    numbers (55) and as proportions (0.55). It assumes values
     between 0 and 1 are proportions and multiplies them by 100.
     Args:
         df (pd.Dataframe): The input pandas DataFrame.
         columns (list[str]): A list of column names to standardize.
         treat_one_as_proportion (bool):
-            - If True (default): The value `1` is treated as a proportion and converted to `100`.
+            - If True (default): The value `1` is treated as a proportion and converted to `100%`.
             - If False: The value `1` is treated as `1%`.
         round_digits (int): The number of decimal places to round the final result to.

{dragon_ml_toolbox-3.0.0.dist-info → dragon_ml_toolbox-3.1.0.dist-info}/WHEEL RENAMED Viewed

File without changes

{dragon_ml_toolbox-3.0.0.dist-info → dragon_ml_toolbox-3.1.0.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{dragon_ml_toolbox-3.0.0.dist-info → dragon_ml_toolbox-3.1.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md RENAMED Viewed

File without changes

{dragon_ml_toolbox-3.0.0.dist-info → dragon_ml_toolbox-3.1.0.dist-info}/top_level.txt RENAMED Viewed

File without changes

dragon-ml-toolbox 3.0.0__py3-none-any.whl → 3.1.0__py3-none-any.whl

Potentially problematic release.

dragon-ml-toolbox 3.0.0__py3-none-any.whl → 3.1.0__py3-none-any.whl