dragon-ml-toolbox 3.4.0__tar.gz → 3.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dragon-ml-toolbox might be problematic. See the package's registry page for more details.
- {dragon_ml_toolbox-3.4.0/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-3.5.0}/PKG-INFO +1 -1
- {dragon_ml_toolbox-3.4.0 → dragon_ml_toolbox-3.5.0/dragon_ml_toolbox.egg-info}/PKG-INFO +1 -1
- {dragon_ml_toolbox-3.4.0 → dragon_ml_toolbox-3.5.0}/ml_tools/ETL_engineering.py +88 -76
- {dragon_ml_toolbox-3.4.0 → dragon_ml_toolbox-3.5.0}/pyproject.toml +1 -1
- {dragon_ml_toolbox-3.4.0 → dragon_ml_toolbox-3.5.0}/LICENSE +0 -0
- {dragon_ml_toolbox-3.4.0 → dragon_ml_toolbox-3.5.0}/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-3.4.0 → dragon_ml_toolbox-3.5.0}/README.md +0 -0
- {dragon_ml_toolbox-3.4.0 → dragon_ml_toolbox-3.5.0}/dragon_ml_toolbox.egg-info/SOURCES.txt +0 -0
- {dragon_ml_toolbox-3.4.0 → dragon_ml_toolbox-3.5.0}/dragon_ml_toolbox.egg-info/dependency_links.txt +0 -0
- {dragon_ml_toolbox-3.4.0 → dragon_ml_toolbox-3.5.0}/dragon_ml_toolbox.egg-info/requires.txt +0 -0
- {dragon_ml_toolbox-3.4.0 → dragon_ml_toolbox-3.5.0}/dragon_ml_toolbox.egg-info/top_level.txt +0 -0
- {dragon_ml_toolbox-3.4.0 → dragon_ml_toolbox-3.5.0}/ml_tools/GUI_tools.py +0 -0
- {dragon_ml_toolbox-3.4.0 → dragon_ml_toolbox-3.5.0}/ml_tools/MICE_imputation.py +0 -0
- {dragon_ml_toolbox-3.4.0 → dragon_ml_toolbox-3.5.0}/ml_tools/ML_callbacks.py +0 -0
- {dragon_ml_toolbox-3.4.0 → dragon_ml_toolbox-3.5.0}/ml_tools/ML_evaluation.py +0 -0
- {dragon_ml_toolbox-3.4.0 → dragon_ml_toolbox-3.5.0}/ml_tools/ML_trainer.py +0 -0
- {dragon_ml_toolbox-3.4.0 → dragon_ml_toolbox-3.5.0}/ml_tools/ML_tutorial.py +0 -0
- {dragon_ml_toolbox-3.4.0 → dragon_ml_toolbox-3.5.0}/ml_tools/PSO_optimization.py +0 -0
- {dragon_ml_toolbox-3.4.0 → dragon_ml_toolbox-3.5.0}/ml_tools/RNN_forecast.py +0 -0
- {dragon_ml_toolbox-3.4.0 → dragon_ml_toolbox-3.5.0}/ml_tools/VIF_factor.py +0 -0
- {dragon_ml_toolbox-3.4.0 → dragon_ml_toolbox-3.5.0}/ml_tools/__init__.py +0 -0
- {dragon_ml_toolbox-3.4.0 → dragon_ml_toolbox-3.5.0}/ml_tools/_particle_swarm_optimization.py +0 -0
- {dragon_ml_toolbox-3.4.0 → dragon_ml_toolbox-3.5.0}/ml_tools/_pytorch_models.py +0 -0
- {dragon_ml_toolbox-3.4.0 → dragon_ml_toolbox-3.5.0}/ml_tools/data_exploration.py +0 -0
- {dragon_ml_toolbox-3.4.0 → dragon_ml_toolbox-3.5.0}/ml_tools/datasetmaster.py +0 -0
- {dragon_ml_toolbox-3.4.0 → dragon_ml_toolbox-3.5.0}/ml_tools/ensemble_learning.py +0 -0
- {dragon_ml_toolbox-3.4.0 → dragon_ml_toolbox-3.5.0}/ml_tools/handle_excel.py +0 -0
- {dragon_ml_toolbox-3.4.0 → dragon_ml_toolbox-3.5.0}/ml_tools/logger.py +0 -0
- {dragon_ml_toolbox-3.4.0 → dragon_ml_toolbox-3.5.0}/ml_tools/utilities.py +0 -0
- {dragon_ml_toolbox-3.4.0 → dragon_ml_toolbox-3.5.0}/setup.cfg +0 -0
|
@@ -2,7 +2,6 @@ import polars as pl
|
|
|
2
2
|
import re
|
|
3
3
|
from typing import Literal, Union, Optional, Any, Callable, List, Dict, Tuple
|
|
4
4
|
from .utilities import _script_info
|
|
5
|
-
import pandas as pd
|
|
6
5
|
from .logger import _LOGGER
|
|
7
6
|
|
|
8
7
|
|
|
@@ -24,124 +23,137 @@ __all__ = [
|
|
|
24
23
|
]
|
|
25
24
|
|
|
26
25
|
########## EXTRACT and CLEAN ##########
|
|
27
|
-
|
|
28
26
|
class ColumnCleaner:
|
|
29
27
|
"""
|
|
30
|
-
|
|
31
|
-
|
|
28
|
+
A configuration object that defines cleaning rules for a single Polars DataFrame column.
|
|
29
|
+
|
|
30
|
+
This class holds a dictionary of regex-to-replacement rules, the target column name,
|
|
31
|
+
and the case-sensitivity setting. It is intended to be used with the DataFrameCleaner.
|
|
32
32
|
|
|
33
33
|
Notes:
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
34
|
+
- Define rules from most specific to more general to create a fallback system.
|
|
35
|
+
- Beware of chain replacements (rules matching strings that have already been
|
|
36
|
+
changed by a previous rule in the same cleaner).
|
|
37
|
+
|
|
38
38
|
Args:
|
|
39
|
+
column_name (str):
|
|
40
|
+
The name of the column to be cleaned.
|
|
39
41
|
rules (Dict[str, str]):
|
|
40
42
|
A dictionary of regex patterns to replacement strings. Can use
|
|
41
|
-
backreferences
|
|
43
|
+
backreferences (e.g., r'$1 $2') for captured groups. Note that Polars
|
|
44
|
+
uses a '$' prefix for backreferences.
|
|
42
45
|
case_insensitive (bool):
|
|
43
|
-
If True, regex matching ignores case.
|
|
46
|
+
If True (default), regex matching ignores case.
|
|
47
|
+
|
|
48
|
+
## Usage Example
|
|
49
|
+
|
|
50
|
+
```python
|
|
51
|
+
phone_rules = {
|
|
52
|
+
# Matches (123) 456-7890 and reformats to 123-456-7890
|
|
53
|
+
r'\((\d{3})\)\s*(\d{3})-(\d{4})': r'$1-$2-$3'
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
phone_cleaner = ColumnCleaner(column_name='phone_number', rules=phone_rules)
|
|
57
|
+
|
|
58
|
+
# This object would then be passed to a DataFrameCleaner.
|
|
59
|
+
```
|
|
44
60
|
"""
|
|
45
|
-
def __init__(self, rules: Dict[str, str], case_insensitive: bool = True):
|
|
61
|
+
def __init__(self, column_name: str, rules: Dict[str, str], case_insensitive: bool = True):
|
|
62
|
+
if not isinstance(column_name, str) or not column_name:
|
|
63
|
+
raise TypeError("The 'column_name' must be a non-empty string.")
|
|
46
64
|
if not isinstance(rules, dict):
|
|
47
65
|
raise TypeError("The 'rules' argument must be a dictionary.")
|
|
48
66
|
|
|
49
|
-
# Validate regex
|
|
67
|
+
# Validate each regex pattern for correctness
|
|
50
68
|
for pattern in rules.keys():
|
|
51
69
|
try:
|
|
52
70
|
re.compile(pattern)
|
|
53
71
|
except re.error as e:
|
|
54
72
|
raise ValueError(f"Invalid regex pattern '{pattern}': {e}") from e
|
|
55
73
|
|
|
74
|
+
self.column_name = column_name
|
|
56
75
|
self.rules = rules
|
|
57
76
|
self.case_insensitive = case_insensitive
|
|
58
77
|
|
|
59
|
-
def clean(self, series: pd.Series) -> pd.Series:
|
|
60
|
-
"""
|
|
61
|
-
Applies the standardization rules sequentially to the provided Series.
|
|
62
|
-
|
|
63
|
-
Args:
|
|
64
|
-
series (pd.Series): The pandas Series to clean.
|
|
65
|
-
|
|
66
|
-
Returns:
|
|
67
|
-
pd.Series: A new Series with the regex replacements applied.
|
|
68
|
-
"""
|
|
69
|
-
cleaned_series = series.astype(str)
|
|
70
|
-
|
|
71
|
-
# Set the regex flags based on the case_insensitive setting
|
|
72
|
-
flags = re.IGNORECASE if self.case_insensitive else 0
|
|
73
|
-
|
|
74
|
-
# Sequentially apply each regex rule
|
|
75
|
-
for pattern, replacement in self.rules.items():
|
|
76
|
-
cleaned_series = cleaned_series.str.replace(
|
|
77
|
-
pattern,
|
|
78
|
-
replacement,
|
|
79
|
-
regex=True,
|
|
80
|
-
flags=flags
|
|
81
|
-
)
|
|
82
|
-
|
|
83
|
-
return cleaned_series
|
|
84
|
-
|
|
85
78
|
|
|
86
79
|
class DataFrameCleaner:
|
|
87
80
|
"""
|
|
88
|
-
Orchestrates
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
- Write separate, specific rules for each case. Don't combine patterns with an "OR".
|
|
94
|
-
- Define rules from most specific to more general to create a fallback system.
|
|
95
|
-
- Beware of chain replacements (rules matching strings that have already been changed by a previous rule).
|
|
81
|
+
Orchestrates cleaning multiple columns in a Polars DataFrame.
|
|
82
|
+
|
|
83
|
+
This class takes a list of ColumnCleaner objects and applies their defined
|
|
84
|
+
rules to the corresponding columns of a DataFrame using high-performance
|
|
85
|
+
Polars expressions.
|
|
96
86
|
|
|
97
87
|
Args:
|
|
98
|
-
|
|
99
|
-
A
|
|
100
|
-
|
|
88
|
+
cleaners (List[ColumnCleaner]):
|
|
89
|
+
A list of ColumnCleaner configuration objects.
|
|
90
|
+
|
|
91
|
+
Raises:
|
|
92
|
+
TypeError: If 'cleaners' is not a list or contains non-ColumnCleaner objects.
|
|
93
|
+
ValueError: If multiple ColumnCleaner objects target the same column.
|
|
101
94
|
"""
|
|
102
|
-
def __init__(self,
|
|
103
|
-
if not isinstance(
|
|
104
|
-
raise TypeError("The '
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
95
|
+
def __init__(self, cleaners: List[ColumnCleaner]):
|
|
96
|
+
if not isinstance(cleaners, list):
|
|
97
|
+
raise TypeError("The 'cleaners' argument must be a list of ColumnCleaner objects.")
|
|
98
|
+
|
|
99
|
+
seen_columns = set()
|
|
100
|
+
for cleaner in cleaners:
|
|
101
|
+
if not isinstance(cleaner, ColumnCleaner):
|
|
108
102
|
raise TypeError(
|
|
109
|
-
f"
|
|
110
|
-
f"
|
|
103
|
+
f"All items in 'cleaners' list must be ColumnCleaner objects, "
|
|
104
|
+
f"but found an object of type {type(cleaner).__name__}."
|
|
111
105
|
)
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
106
|
+
if cleaner.column_name in seen_columns:
|
|
107
|
+
raise ValueError(
|
|
108
|
+
f"Duplicate ColumnCleaner found for column '{cleaner.column_name}'. "
|
|
109
|
+
"Each column should only have one cleaner."
|
|
110
|
+
)
|
|
111
|
+
seen_columns.add(cleaner.column_name)
|
|
115
112
|
|
|
116
|
-
|
|
113
|
+
self.cleaners = cleaners
|
|
114
|
+
|
|
115
|
+
def clean(self, df: pl.DataFrame) -> pl.DataFrame:
|
|
117
116
|
"""
|
|
118
|
-
Applies all defined cleaning rules to the DataFrame.
|
|
117
|
+
Applies all defined cleaning rules to the Polars DataFrame.
|
|
119
118
|
|
|
120
119
|
Args:
|
|
121
|
-
df (
|
|
120
|
+
df (pl.DataFrame): The Polars DataFrame to clean.
|
|
122
121
|
|
|
123
122
|
Returns:
|
|
124
|
-
|
|
123
|
+
pl.DataFrame: A new, cleaned Polars DataFrame.
|
|
124
|
+
|
|
125
|
+
Raises:
|
|
126
|
+
ValueError: If any columns specified in the cleaners are not found
|
|
127
|
+
in the input DataFrame.
|
|
125
128
|
"""
|
|
126
|
-
rule_columns =
|
|
129
|
+
rule_columns = {c.column_name for c in self.cleaners}
|
|
127
130
|
df_columns = set(df.columns)
|
|
128
|
-
|
|
129
131
|
missing_columns = rule_columns - df_columns
|
|
130
|
-
|
|
132
|
+
|
|
131
133
|
if missing_columns:
|
|
132
|
-
# Report all missing columns in a single, clear error message
|
|
133
134
|
raise ValueError(
|
|
134
|
-
f"The following columns specified in
|
|
135
|
+
f"The following columns specified in cleaning rules "
|
|
135
136
|
f"were not found in the DataFrame: {sorted(list(missing_columns))}"
|
|
136
137
|
)
|
|
138
|
+
|
|
139
|
+
df_cleaned = df.clone()
|
|
137
140
|
|
|
138
|
-
#
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
#
|
|
143
|
-
|
|
144
|
-
|
|
141
|
+
# Build and apply a series of expressions for each column
|
|
142
|
+
for cleaner in self.cleaners:
|
|
143
|
+
col_name = cleaner.column_name
|
|
144
|
+
|
|
145
|
+
# Start with the column, cast to String for replacement operations
|
|
146
|
+
col_expr = pl.col(col_name).cast(pl.String)
|
|
147
|
+
|
|
148
|
+
# Sequentially chain 'replace_all' expressions for each rule
|
|
149
|
+
for pattern, replacement in cleaner.rules.items():
|
|
150
|
+
final_pattern = f"(?i){pattern}" if cleaner.case_insensitive else pattern
|
|
151
|
+
col_expr = col_expr.str.replace_all(final_pattern, replacement)
|
|
152
|
+
|
|
153
|
+
# Execute the expression chain for the column
|
|
154
|
+
df_cleaned = df_cleaned.with_columns(col_expr.alias(col_name))
|
|
155
|
+
|
|
156
|
+
print(f"Cleaned {len(self.cleaners)} columns.")
|
|
145
157
|
|
|
146
158
|
return df_cleaned
|
|
147
159
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dragon_ml_toolbox-3.4.0 → dragon_ml_toolbox-3.5.0}/dragon_ml_toolbox.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
|
File without changes
|
{dragon_ml_toolbox-3.4.0 → dragon_ml_toolbox-3.5.0}/dragon_ml_toolbox.egg-info/top_level.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dragon_ml_toolbox-3.4.0 → dragon_ml_toolbox-3.5.0}/ml_tools/_particle_swarm_optimization.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|