dragon-ml-toolbox 3.0.0__py3-none-any.whl → 3.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dragon-ml-toolbox might be problematic. Click here for more details.

@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dragon-ml-toolbox
3
- Version: 3.0.0
3
+ Version: 3.1.0
4
4
  Summary: A collection of tools for data science and machine learning projects.
5
5
  Author-email: Karl Loza <luigiloza@gmail.com>
6
6
  License-Expression: MIT
@@ -1,6 +1,6 @@
1
- dragon_ml_toolbox-3.0.0.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
2
- dragon_ml_toolbox-3.0.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=6cfpIeQ6D4Mcs10nkogQrkVyq1T7i2qXjjNHFoUMOyE,1892
3
- ml_tools/ETL_engineering.py,sha256=SRiloWhSpopS4ay8mzUu0H4e9-37Ox_jDHzODqsQ8pc,31642
1
+ dragon_ml_toolbox-3.1.0.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
2
+ dragon_ml_toolbox-3.1.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=6cfpIeQ6D4Mcs10nkogQrkVyq1T7i2qXjjNHFoUMOyE,1892
3
+ ml_tools/ETL_engineering.py,sha256=qxLbw8Vc0lOHUJm5ou280Tvw3oh_G1UHonxfa7nu_4Q,33008
4
4
  ml_tools/GUI_tools.py,sha256=uFx6zIrQZzDPSTtOSHz8ptz-fxZiQz-lXHcrqwuYV_E,20385
5
5
  ml_tools/MICE_imputation.py,sha256=ed-YeQkEAeHxTNkWIHs09T4YeYNF0aqAnrUTcdIEp9E,11372
6
6
  ml_tools/ML_callbacks.py,sha256=gHZk-lyzAax6iEtG26zHuoobdAZCFJ6BmI6pWoXkOrw,13189
@@ -13,13 +13,13 @@ ml_tools/VIF_factor.py,sha256=5GVAldH69Vkei3WRUZN1uPBMzGoOOeEOA-bgmZXbbUw,10301
13
13
  ml_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
14
14
  ml_tools/_particle_swarm_optimization.py,sha256=b_eNNkA89Y40hj76KauivT8KLScH1B9wF2IXptOqkOw,22220
15
15
  ml_tools/_pytorch_models.py,sha256=bpWZsrSwCvHJQkR6UfoPpElsMv9AvmiNErNHC8NYB_I,10132
16
- ml_tools/data_exploration.py,sha256=Fzbz_DKZ7F2e3-JbahLqKr3aP6lt9aCK9rNOHvR7nlA,23665
16
+ ml_tools/data_exploration.py,sha256=bOcCoQLeDPFJ7nB5Fsi16lzB22cG-c-mxObMsTetgS4,23655
17
17
  ml_tools/datasetmaster.py,sha256=N-uwfzWnl_qnoAqjbfS98I1pVNra5u6rhKLdWbFIReA,30122
18
18
  ml_tools/ensemble_learning.py,sha256=PPtBBLgLvaYOdY-MlcjXuxWWXf3JQavLNEysFgzjc_s,37470
19
19
  ml_tools/handle_excel.py,sha256=lwds7rDLlGSCWiWGI7xNg-Z7kxAepogp0lstSFa0590,12949
20
20
  ml_tools/logger.py,sha256=jC4Q2OqmDm8ZO9VpuZqBSWdXryqaJvLscqVJ6caNMOk,6009
21
21
  ml_tools/utilities.py,sha256=opNR-ACH6BnLkWAKcb19ef5tFxfx22TI6E2o0RYwiGA,21021
22
- dragon_ml_toolbox-3.0.0.dist-info/METADATA,sha256=nmhUu0bwN4z1letePaDzGIQlmDUaBQ32esqGB-OasU4,3273
23
- dragon_ml_toolbox-3.0.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
24
- dragon_ml_toolbox-3.0.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
25
- dragon_ml_toolbox-3.0.0.dist-info/RECORD,,
22
+ dragon_ml_toolbox-3.1.0.dist-info/METADATA,sha256=yGk-slwRhPF23NxfVG0vR0NeIQbo_mJ-_ZEIomBLvrQ,3273
23
+ dragon_ml_toolbox-3.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
24
+ dragon_ml_toolbox-3.1.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
25
+ dragon_ml_toolbox-3.1.0.dist-info/RECORD,,
@@ -25,18 +25,26 @@ __all__ = [
25
25
 
26
26
  class ColumnCleaner:
27
27
  """
28
- Cleans and standardizes a single pandas Series based on a dictionary of regex-to-value replacement rules.
28
+ Cleans and standardizes a pandas Series by applying regex-to-replacement rules.
29
+ Supports sub-string replacements and case-insensitivity.
30
+
31
+ Notes:
32
+ - Write separate, specific rules for each case. Don't combine patterns with an "OR".
33
+ - Define rules from most specific to more general to create a fallback system.
34
+ - Beware of chain replacements (rules matching strings that have already been changed by a previous rule).
29
35
 
30
36
  Args:
31
37
  rules (Dict[str, str]):
32
- A dictionary where each key is a regular expression pattern and
33
- each value is the standardized string to replace matches with.
38
+ A dictionary of regex patterns to replacement strings. Can use
39
+ backreferences in the replacement statement (e.g., r'\\1 \\2 \\3 \\4 \\5') for captured groups.
40
+ case_insensitive (bool):
41
+ If True, regex matching ignores case.
34
42
  """
35
- def __init__(self, rules: Dict[str, str]):
43
+ def __init__(self, rules: Dict[str, str], case_insensitive: bool = True):
36
44
  if not isinstance(rules, dict):
37
45
  raise TypeError("The 'rules' argument must be a dictionary.")
38
46
 
39
- # Validate that all keys are valid regular expressions
47
+ # Validate regex patterns
40
48
  for pattern in rules.keys():
41
49
  try:
42
50
  re.compile(pattern)
@@ -44,32 +52,52 @@ class ColumnCleaner:
44
52
  raise ValueError(f"Invalid regex pattern '{pattern}': {e}") from e
45
53
 
46
54
  self.rules = rules
55
+ self.case_insensitive = case_insensitive
47
56
 
48
57
  def clean(self, series: pd.Series) -> pd.Series:
49
58
  """
50
- Applies the standardization rules to the provided Series (requires string data).
59
+ Applies the standardization rules sequentially to the provided Series.
51
60
 
52
- Non-matching values are kept as they are.
53
-
54
61
  Args:
55
62
  series (pd.Series): The pandas Series to clean.
56
63
 
57
64
  Returns:
58
- pd.Series: A new Series with the values cleaned and standardized.
65
+ pd.Series: A new Series with the regex replacements applied.
59
66
  """
60
- return series.astype(str).replace(self.rules, regex=True)
67
+ cleaned_series = series.astype(str)
68
+
69
+ # Set the regex flags based on the case_insensitive setting
70
+ flags = re.IGNORECASE if self.case_insensitive else 0
71
+
72
+ # Sequentially apply each regex rule
73
+ for pattern, replacement in self.rules.items():
74
+ cleaned_series = cleaned_series.str.replace(
75
+ pattern,
76
+ replacement,
77
+ regex=True,
78
+ flags=flags
79
+ )
80
+
81
+ return cleaned_series
61
82
 
62
83
 
63
84
  class DataFrameCleaner:
64
85
  """
65
86
  Orchestrates the cleaning of multiple columns in a pandas DataFrame using a nested dictionary of rules and `ColumnCleaner` objects.
87
+
88
+ Chosen case-sensitivity is applied to all columns.
89
+
90
+ Notes:
91
+ - Write separate, specific rules for each case. Don't combine patterns with an "OR".
92
+ - Define rules from most specific to more general to create a fallback system.
93
+ - Beware of chain replacements (rules matching strings that have already been changed by a previous rule).
66
94
 
67
95
  Args:
68
96
  rules (Dict[str, Dict[str, str]]):
69
97
  A nested dictionary where each top-level key is a column name,
70
98
  and its value is a dictionary of regex rules for that column, as expected by `ColumnCleaner`.
71
99
  """
72
- def __init__(self, rules: Dict[str, Dict[str, str]]):
100
+ def __init__(self, rules: Dict[str, Dict[str, str]], case_insensitive: bool = True):
73
101
  if not isinstance(rules, dict):
74
102
  raise TypeError("The 'rules' argument must be a nested dictionary.")
75
103
 
@@ -81,6 +109,7 @@ class DataFrameCleaner:
81
109
  )
82
110
 
83
111
  self.rules = rules
112
+ self.case_insensitive = case_insensitive
84
113
 
85
114
  def clean(self, df: pd.DataFrame) -> pd.DataFrame:
86
115
  """
@@ -109,7 +138,7 @@ class DataFrameCleaner:
109
138
 
110
139
  for column_name, column_rules in self.rules.items():
111
140
  # Create and apply the specific cleaner for the column
112
- cleaner = ColumnCleaner(rules=column_rules)
141
+ cleaner = ColumnCleaner(rules=column_rules, case_insensitive=self.case_insensitive)
113
142
  df_cleaned[column_name] = cleaner.clean(df_cleaned[column_name])
114
143
 
115
144
  return df_cleaned
@@ -587,14 +587,14 @@ def standardize_percentages(
587
587
  Standardizes numeric columns containing mixed-format percentages.
588
588
 
589
589
  This function cleans columns where percentages might be entered as whole
590
- numbers (e.g., 55) or as proportions (e.g., 0.55). It assumes values
590
+ numbers (55) and as proportions (0.55). It assumes values
591
591
  between 0 and 1 are proportions and multiplies them by 100.
592
592
 
593
593
  Args:
594
594
  df (pd.DataFrame): The input pandas DataFrame.
595
595
  columns (list[str]): A list of column names to standardize.
596
596
  treat_one_as_proportion (bool):
597
- - If True (default): The value `1` is treated as a proportion and converted to `100`.
597
+ - If True (default): The value `1` is treated as a proportion and converted to `100%`.
598
598
  - If False: The value `1` is treated as `1%`.
599
599
  round_digits (int): The number of decimal places to round the final result to.
600
600