dragon-ml-toolbox 3.4.0__tar.gz → 3.5.1__tar.gz

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.

Potentially problematic release: this version of dragon-ml-toolbox might be problematic.

Files changed (30)
  1. {dragon_ml_toolbox-3.4.0/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-3.5.1}/PKG-INFO +1 -1
  2. {dragon_ml_toolbox-3.4.0 → dragon_ml_toolbox-3.5.1/dragon_ml_toolbox.egg-info}/PKG-INFO +1 -1
  3. {dragon_ml_toolbox-3.4.0 → dragon_ml_toolbox-3.5.1}/ml_tools/ETL_engineering.py +88 -77
  4. {dragon_ml_toolbox-3.4.0 → dragon_ml_toolbox-3.5.1}/ml_tools/data_exploration.py +1 -1
  5. {dragon_ml_toolbox-3.4.0 → dragon_ml_toolbox-3.5.1}/pyproject.toml +1 -1
  6. {dragon_ml_toolbox-3.4.0 → dragon_ml_toolbox-3.5.1}/LICENSE +0 -0
  7. {dragon_ml_toolbox-3.4.0 → dragon_ml_toolbox-3.5.1}/LICENSE-THIRD-PARTY.md +0 -0
  8. {dragon_ml_toolbox-3.4.0 → dragon_ml_toolbox-3.5.1}/README.md +0 -0
  9. {dragon_ml_toolbox-3.4.0 → dragon_ml_toolbox-3.5.1}/dragon_ml_toolbox.egg-info/SOURCES.txt +0 -0
  10. {dragon_ml_toolbox-3.4.0 → dragon_ml_toolbox-3.5.1}/dragon_ml_toolbox.egg-info/dependency_links.txt +0 -0
  11. {dragon_ml_toolbox-3.4.0 → dragon_ml_toolbox-3.5.1}/dragon_ml_toolbox.egg-info/requires.txt +0 -0
  12. {dragon_ml_toolbox-3.4.0 → dragon_ml_toolbox-3.5.1}/dragon_ml_toolbox.egg-info/top_level.txt +0 -0
  13. {dragon_ml_toolbox-3.4.0 → dragon_ml_toolbox-3.5.1}/ml_tools/GUI_tools.py +0 -0
  14. {dragon_ml_toolbox-3.4.0 → dragon_ml_toolbox-3.5.1}/ml_tools/MICE_imputation.py +0 -0
  15. {dragon_ml_toolbox-3.4.0 → dragon_ml_toolbox-3.5.1}/ml_tools/ML_callbacks.py +0 -0
  16. {dragon_ml_toolbox-3.4.0 → dragon_ml_toolbox-3.5.1}/ml_tools/ML_evaluation.py +0 -0
  17. {dragon_ml_toolbox-3.4.0 → dragon_ml_toolbox-3.5.1}/ml_tools/ML_trainer.py +0 -0
  18. {dragon_ml_toolbox-3.4.0 → dragon_ml_toolbox-3.5.1}/ml_tools/ML_tutorial.py +0 -0
  19. {dragon_ml_toolbox-3.4.0 → dragon_ml_toolbox-3.5.1}/ml_tools/PSO_optimization.py +0 -0
  20. {dragon_ml_toolbox-3.4.0 → dragon_ml_toolbox-3.5.1}/ml_tools/RNN_forecast.py +0 -0
  21. {dragon_ml_toolbox-3.4.0 → dragon_ml_toolbox-3.5.1}/ml_tools/VIF_factor.py +0 -0
  22. {dragon_ml_toolbox-3.4.0 → dragon_ml_toolbox-3.5.1}/ml_tools/__init__.py +0 -0
  23. {dragon_ml_toolbox-3.4.0 → dragon_ml_toolbox-3.5.1}/ml_tools/_particle_swarm_optimization.py +0 -0
  24. {dragon_ml_toolbox-3.4.0 → dragon_ml_toolbox-3.5.1}/ml_tools/_pytorch_models.py +0 -0
  25. {dragon_ml_toolbox-3.4.0 → dragon_ml_toolbox-3.5.1}/ml_tools/datasetmaster.py +0 -0
  26. {dragon_ml_toolbox-3.4.0 → dragon_ml_toolbox-3.5.1}/ml_tools/ensemble_learning.py +0 -0
  27. {dragon_ml_toolbox-3.4.0 → dragon_ml_toolbox-3.5.1}/ml_tools/handle_excel.py +0 -0
  28. {dragon_ml_toolbox-3.4.0 → dragon_ml_toolbox-3.5.1}/ml_tools/logger.py +0 -0
  29. {dragon_ml_toolbox-3.4.0 → dragon_ml_toolbox-3.5.1}/ml_tools/utilities.py +0 -0
  30. {dragon_ml_toolbox-3.4.0 → dragon_ml_toolbox-3.5.1}/setup.cfg +0 -0
{dragon_ml_toolbox-3.4.0/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-3.5.1}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 3.4.0
+Version: 3.5.1
 Summary: A collection of tools for data science and machine learning projects.
 Author-email: Karl Loza <luigiloza@gmail.com>
 License-Expression: MIT
{dragon_ml_toolbox-3.4.0 → dragon_ml_toolbox-3.5.1/dragon_ml_toolbox.egg-info}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 3.4.0
+Version: 3.5.1
 Summary: A collection of tools for data science and machine learning projects.
 Author-email: Karl Loza <luigiloza@gmail.com>
 License-Expression: MIT
{dragon_ml_toolbox-3.4.0 → dragon_ml_toolbox-3.5.1}/ml_tools/ETL_engineering.py

@@ -2,7 +2,6 @@ import polars as pl
 import re
 from typing import Literal, Union, Optional, Any, Callable, List, Dict, Tuple
 from .utilities import _script_info
-import pandas as pd
 from .logger import _LOGGER
 
 
@@ -24,124 +23,136 @@ __all__ = [
 ]
 
 ########## EXTRACT and CLEAN ##########
-
 class ColumnCleaner:
     """
-    Cleans and standardizes a pandas Series by applying regex-to-replacement rules.
-    Supports sub-string replacements and case-insensitivity.
+    A configuration object that defines cleaning rules for a single Polars DataFrame column.
+
+    This class holds a dictionary of regex-to-replacement rules, the target column name,
+    and the case-sensitivity setting. It is intended to be used with the DataFrameCleaner.
 
     Notes:
-        - Write separate, specific rules for each case. Don't combine patterns with an "OR".
-        - Define rules from most specific to more general to create a fallback system.
-        - Beware of chain replacements (rules matching strings that have already been changed by a previous rule).
-
+        - Define rules from most specific to more general to create a fallback system.
+        - Beware of chain replacements (rules matching strings that have already been
+          changed by a previous rule in the same cleaner).
+
     Args:
+        column_name (str):
+            The name of the column to be cleaned.
         rules (Dict[str, str]):
             A dictionary of regex patterns to replacement strings. Can use
-            backreferences in the replacement statement (e.g., r'\\1 \\2 \\3 \\4 \\5') for captured groups.
+            backreferences (e.g., r'$1 $2') for captured groups. Note that Polars
+            uses a '$' prefix for backreferences.
         case_insensitive (bool):
-            If True, regex matching ignores case.
+            If True (default), regex matching ignores case.
+
+    ## Usage Example
+
+    ```python
+    id_rules = {
+        # Matches 'ID-12345' or 'ID 12345' and reformats to 'ID:12345'
+        r'ID[- ](\d+)': r'ID:$1'
+    }
+
+    id_cleaner = ColumnCleaner(column_name='user_id', rules=id_rules)
+    # This object would then be passed to a DataFrameCleaner.
+    ```
     """
-    def __init__(self, rules: Dict[str, str], case_insensitive: bool = True):
+    def __init__(self, column_name: str, rules: Dict[str, str], case_insensitive: bool = True):
+        if not isinstance(column_name, str) or not column_name:
+            raise TypeError("The 'column_name' must be a non-empty string.")
         if not isinstance(rules, dict):
            raise TypeError("The 'rules' argument must be a dictionary.")
 
-        # Validate regex patterns
+        # Validate each regex pattern for correctness
         for pattern in rules.keys():
             try:
                 re.compile(pattern)
             except re.error as e:
                 raise ValueError(f"Invalid regex pattern '{pattern}': {e}") from e
 
+        self.column_name = column_name
         self.rules = rules
         self.case_insensitive = case_insensitive
 
-    def clean(self, series: pd.Series) -> pd.Series:
-        """
-        Applies the standardization rules sequentially to the provided Series.
-
-        Args:
-            series (pd.Series): The pandas Series to clean.
-
-        Returns:
-            pd.Series: A new Series with the regex replacements applied.
-        """
-        cleaned_series = series.astype(str)
-
-        # Set the regex flags based on the case_insensitive setting
-        flags = re.IGNORECASE if self.case_insensitive else 0
-
-        # Sequentially apply each regex rule
-        for pattern, replacement in self.rules.items():
-            cleaned_series = cleaned_series.str.replace(
-                pattern,
-                replacement,
-                regex=True,
-                flags=flags
-            )
-
-        return cleaned_series
-
 
 class DataFrameCleaner:
     """
-    Orchestrates the cleaning of multiple columns in a pandas DataFrame using a nested dictionary of rules and `ColumnCleaner` objects.
-
-    Chosen case-sensitivity is applied to all columns.
-
-    Notes:
-        - Write separate, specific rules for each case. Don't combine patterns with an "OR".
-        - Define rules from most specific to more general to create a fallback system.
-        - Beware of chain replacements (rules matching strings that have already been changed by a previous rule).
+    Orchestrates cleaning multiple columns in a Polars DataFrame.
+
+    This class takes a list of ColumnCleaner objects and applies their defined
+    rules to the corresponding columns of a DataFrame using high-performance
+    Polars expressions.
 
     Args:
-        rules (Dict[str, Dict[str, str]]):
-            A nested dictionary where each top-level key is a column name,
-            and its value is a dictionary of regex rules for that column, as expected by `ColumnCleaner`.
+        cleaners (List[ColumnCleaner]):
+            A list of ColumnCleaner configuration objects.
+
+    Raises:
+        TypeError: If 'cleaners' is not a list or contains non-ColumnCleaner objects.
+        ValueError: If multiple ColumnCleaner objects target the same column.
     """
-    def __init__(self, rules: Dict[str, Dict[str, str]], case_insensitive: bool = True):
-        if not isinstance(rules, dict):
-            raise TypeError("The 'rules' argument must be a nested dictionary.")
-
-        for col_name, col_rules in rules.items():
-            if not isinstance(col_rules, dict):
+    def __init__(self, cleaners: List[ColumnCleaner]):
+        if not isinstance(cleaners, list):
+            raise TypeError("The 'cleaners' argument must be a list of ColumnCleaner objects.")
+
+        seen_columns = set()
+        for cleaner in cleaners:
+            if not isinstance(cleaner, ColumnCleaner):
                 raise TypeError(
-                    f"The value for column '{col_name}' must be a dictionary "
-                    f"of rules, but got type {type(col_rules).__name__}."
+                    f"All items in 'cleaners' list must be ColumnCleaner objects, "
+                    f"but found an object of type {type(cleaner).__name__}."
                 )
-
-        self.rules = rules
-        self.case_insensitive = case_insensitive
+            if cleaner.column_name in seen_columns:
+                raise ValueError(
+                    f"Duplicate ColumnCleaner found for column '{cleaner.column_name}'. "
+                    "Each column should only have one cleaner."
+                )
+            seen_columns.add(cleaner.column_name)
+
+        self.cleaners = cleaners
 
-    def clean(self, df: pd.DataFrame) -> pd.DataFrame:
+    def clean(self, df: pl.DataFrame) -> pl.DataFrame:
         """
-        Applies all defined cleaning rules to the DataFrame.
+        Applies all defined cleaning rules to the Polars DataFrame.
 
         Args:
-            df (pd.DataFrame): The pandas DataFrame to clean.
+            df (pl.DataFrame): The Polars DataFrame to clean.
 
         Returns:
-            pd.DataFrame: A new, cleaned DataFrame.
+            pl.DataFrame: A new, cleaned Polars DataFrame.
+
+        Raises:
+            ValueError: If any columns specified in the cleaners are not found
+                        in the input DataFrame.
         """
-        rule_columns = set(self.rules.keys())
+        rule_columns = {c.column_name for c in self.cleaners}
         df_columns = set(df.columns)
-
         missing_columns = rule_columns - df_columns
-
+
         if missing_columns:
-            # Report all missing columns in a single, clear error message
             raise ValueError(
-                f"The following columns specified in the cleaning rules "
+                f"The following columns specified in cleaning rules "
                f"were not found in the DataFrame: {sorted(list(missing_columns))}"
             )
+
+        df_cleaned = df.clone()
 
-        # Start the process
-        df_cleaned = df.copy()
-
-        for column_name, column_rules in self.rules.items():
-            # Create and apply the specific cleaner for the column
-            cleaner = ColumnCleaner(rules=column_rules, case_insensitive=self.case_insensitive)
-            df_cleaned[column_name] = cleaner.clean(df_cleaned[column_name])
+        # Build and apply a series of expressions for each column
+        for cleaner in self.cleaners:
+            col_name = cleaner.column_name
+
+            # Start with the column, cast to String for replacement operations
+            col_expr = pl.col(col_name).cast(pl.String)
+
+            # Sequentially chain 'replace_all' expressions for each rule
+            for pattern, replacement in cleaner.rules.items():
+                final_pattern = f"(?i){pattern}" if cleaner.case_insensitive else pattern
+                col_expr = col_expr.str.replace_all(final_pattern, replacement)
+
+            # Execute the expression chain for the column
+            df_cleaned = df_cleaned.with_columns(col_expr.alias(col_name))
+
+        print(f"Cleaned {len(self.cleaners)} columns.")
 
         return df_cleaned
 
{dragon_ml_toolbox-3.4.0 → dragon_ml_toolbox-3.5.1}/ml_tools/ETL_engineering.py (continued)

@@ -517,7 +528,7 @@ class KeywordDummifier:
 
         categorize_expr = categorize_expr.otherwise(None).alias("category")
 
-        temp_df = pl.DataFrame(categorize_expr)
+        temp_df = pl.select(categorize_expr)
         df_with_dummies = temp_df.to_dummies(columns=["category"])
 
         final_columns = []
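In the `KeywordDummifier` hunk just above, swapping `pl.DataFrame(categorize_expr)` for `pl.select(categorize_expr)` reads as a correctness fix: `pl.select` evaluates a standalone expression into a new DataFrame, which is what the following `to_dummies` call needs, while the `pl.DataFrame` constructor is not meant to receive an expression. A minimal, self-contained illustration; the literal Series here is only a stand-in for the `when/then` chain that `KeywordDummifier` actually builds, which is not shown in this diff:

```python
import polars as pl

# Stand-in for the expression produced inside KeywordDummifier (illustrative only).
categorize_expr = pl.Series("category", ["fruit", "vegetable", "fruit", None])

# pl.select evaluates the standalone expression/Series into a one-column DataFrame...
temp_df = pl.select(categorize_expr)

# ...which can then be expanded into dummy columns, as the surrounding code does.
print(temp_df.to_dummies(columns=["category"]))
```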
{dragon_ml_toolbox-3.4.0 → dragon_ml_toolbox-3.5.1}/ml_tools/data_exploration.py

@@ -91,7 +91,7 @@ def drop_zero_only_columns(df: pd.DataFrame, verbose: bool=True) -> pd.DataFrame
             continue
 
         # For numeric columns, check if there's at least one non-zero value.
-        if (column != 0).any():
+        if (column.fillna(0) != 0).any():
             cols_to_keep.append(col_name)
 
     dropped_columns = original_columns - set(cols_to_keep)
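The one-line change in `drop_zero_only_columns` matters for columns that contain only zeros and missing values: in pandas, `NaN != 0` evaluates to `True`, so such columns previously passed the "has at least one non-zero value" check and were kept. Filling NaNs with 0 before comparing makes them count as zero-only. A quick illustration on a standalone Series (the rest of the function body is unchanged and not shown in this diff):

```python
import numpy as np
import pandas as pd

# A column holding nothing but zeros and missing values.
column = pd.Series([0.0, np.nan, 0.0])

print((column != 0).any())            # True  -> old check kept the column
print((column.fillna(0) != 0).any())  # False -> new check drops it as zero-only
```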
{dragon_ml_toolbox-3.4.0 → dragon_ml_toolbox-3.5.1}/pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "dragon-ml-toolbox"
-version = "3.4.0"
+version = "3.5.1"
 description = "A collection of tools for data science and machine learning projects."
 authors = [
     { name = "Karl Loza", email = "luigiloza@gmail.com" }