dragon-ml-toolbox 2.2.0__tar.gz → 2.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dragon-ml-toolbox might be problematic. Click here for more details.

Files changed (26)
  1. {dragon_ml_toolbox-2.2.0/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-2.2.1}/PKG-INFO +1 -1
  2. {dragon_ml_toolbox-2.2.0 → dragon_ml_toolbox-2.2.1/dragon_ml_toolbox.egg-info}/PKG-INFO +1 -1
  3. {dragon_ml_toolbox-2.2.0 → dragon_ml_toolbox-2.2.1}/ml_tools/ETL_engineering.py +61 -3
  4. {dragon_ml_toolbox-2.2.0 → dragon_ml_toolbox-2.2.1}/ml_tools/data_exploration.py +69 -1
  5. {dragon_ml_toolbox-2.2.0 → dragon_ml_toolbox-2.2.1}/pyproject.toml +1 -1
  6. {dragon_ml_toolbox-2.2.0 → dragon_ml_toolbox-2.2.1}/LICENSE +0 -0
  7. {dragon_ml_toolbox-2.2.0 → dragon_ml_toolbox-2.2.1}/LICENSE-THIRD-PARTY.md +0 -0
  8. {dragon_ml_toolbox-2.2.0 → dragon_ml_toolbox-2.2.1}/README.md +0 -0
  9. {dragon_ml_toolbox-2.2.0 → dragon_ml_toolbox-2.2.1}/dragon_ml_toolbox.egg-info/SOURCES.txt +0 -0
  10. {dragon_ml_toolbox-2.2.0 → dragon_ml_toolbox-2.2.1}/dragon_ml_toolbox.egg-info/dependency_links.txt +0 -0
  11. {dragon_ml_toolbox-2.2.0 → dragon_ml_toolbox-2.2.1}/dragon_ml_toolbox.egg-info/requires.txt +0 -0
  12. {dragon_ml_toolbox-2.2.0 → dragon_ml_toolbox-2.2.1}/dragon_ml_toolbox.egg-info/top_level.txt +0 -0
  13. {dragon_ml_toolbox-2.2.0 → dragon_ml_toolbox-2.2.1}/ml_tools/MICE_imputation.py +0 -0
  14. {dragon_ml_toolbox-2.2.0 → dragon_ml_toolbox-2.2.1}/ml_tools/PSO_optimization.py +0 -0
  15. {dragon_ml_toolbox-2.2.0 → dragon_ml_toolbox-2.2.1}/ml_tools/VIF_factor.py +0 -0
  16. {dragon_ml_toolbox-2.2.0 → dragon_ml_toolbox-2.2.1}/ml_tools/__init__.py +0 -0
  17. {dragon_ml_toolbox-2.2.0 → dragon_ml_toolbox-2.2.1}/ml_tools/_particle_swarm_optimization.py +0 -0
  18. {dragon_ml_toolbox-2.2.0 → dragon_ml_toolbox-2.2.1}/ml_tools/datasetmaster.py +0 -0
  19. {dragon_ml_toolbox-2.2.0 → dragon_ml_toolbox-2.2.1}/ml_tools/ensemble_learning.py +0 -0
  20. {dragon_ml_toolbox-2.2.0 → dragon_ml_toolbox-2.2.1}/ml_tools/handle_excel.py +0 -0
  21. {dragon_ml_toolbox-2.2.0 → dragon_ml_toolbox-2.2.1}/ml_tools/logger.py +0 -0
  22. {dragon_ml_toolbox-2.2.0 → dragon_ml_toolbox-2.2.1}/ml_tools/pytorch_models.py +0 -0
  23. {dragon_ml_toolbox-2.2.0 → dragon_ml_toolbox-2.2.1}/ml_tools/trainer.py +0 -0
  24. {dragon_ml_toolbox-2.2.0 → dragon_ml_toolbox-2.2.1}/ml_tools/utilities.py +0 -0
  25. {dragon_ml_toolbox-2.2.0 → dragon_ml_toolbox-2.2.1}/ml_tools/vision_helpers.py +0 -0
  26. {dragon_ml_toolbox-2.2.0 → dragon_ml_toolbox-2.2.1}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dragon-ml-toolbox
3
- Version: 2.2.0
3
+ Version: 2.2.1
4
4
  Summary: A collection of tools for data science and machine learning projects
5
5
  Author-email: Karl Loza <luigiloza@gmail.com>
6
6
  License-Expression: MIT
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dragon-ml-toolbox
3
- Version: 2.2.0
3
+ Version: 2.2.1
4
4
  Summary: A collection of tools for data science and machine learning projects
5
5
  Author-email: Karl Loza <luigiloza@gmail.com>
6
6
  License-Expression: MIT
@@ -101,7 +101,7 @@ class DataProcessor:
101
101
  raise TypeError("The recipe must be an instance of TransformationRecipe.")
102
102
  if len(recipe) == 0:
103
103
  raise ValueError("The recipe cannot be empty.")
104
- self.recipe = recipe
104
+ self._recipe = recipe
105
105
 
106
106
  def transform(self, df: pl.DataFrame) -> pl.DataFrame:
107
107
  """
@@ -109,7 +109,7 @@ class DataProcessor:
109
109
  """
110
110
  processed_columns = []
111
111
  # Recipe object is iterable
112
- for step in self.recipe:
112
+ for step in self._recipe:
113
113
  input_col_name = step["input_col"]
114
114
  output_col_spec = step["output_col"]
115
115
  transform_action = step["transform"]
@@ -154,6 +154,49 @@ class DataProcessor:
154
154
  return pl.DataFrame()
155
155
 
156
156
  return pl.DataFrame(processed_columns)
157
+
158
def __str__(self) -> str:
    """
    Provides a detailed, human-readable string representation of the
    entire processing pipeline.

    Each recipe step is rendered as its input column, a short name for
    the transformation, and its output column specification.

    Returns:
        str: The multi-line pipeline description.
    """
    header = "DataProcessor Pipeline"
    divider = "-" * len(header)
    num_steps = len(self._recipe)

    # The trailing newline on the count line yields a blank separator line
    # once everything is joined with "\n".
    lines = [
        header,
        divider,
        f"Number of steps: {num_steps}\n",
    ]

    if num_steps == 0:
        lines.append("No transformation steps defined.")
        return "\n".join(lines)

    for i, step in enumerate(self._recipe, 1):
        transform_action = step["transform"]

        # Get a clean name for the transformation action
        if transform_action == _RENAME:  # "rename"
            transform_name = "Rename"
        else:
            # Plain functions expose their own __name__; type(f).__name__
            # would only ever report "function". Instances normally have no
            # __name__ attribute, so they fall back to their class name.
            transform_name = getattr(
                transform_action,
                "__name__",
                type(transform_action).__name__,
            )

        lines.append(f"[{i}] Input: '{step['input_col']}'")
        lines.append(f" - Transform: {transform_name}")
        lines.append(f" - Output(s): {step['output_col']}")
        if i < num_steps:
            lines.append("")  # Add a blank line between steps

    return "\n".join(lines)
194
+
195
def inspect(self) -> None:
    """
    Write the pipeline's string representation to standard output.
    """
    description = str(self)
    print(description)
157
200
 
158
201
 
159
202
  class KeywordDummifier:
@@ -407,7 +450,22 @@ class CategoryMapper:
407
450
  pl.Series: A new Series with categories mapped to numbers.
408
451
  """
409
452
  # Ensure the column is treated as a string for matching keys
410
- return column.cast(pl.Utf8).map_dict(self.mapping, default=self.default_value)
453
+ str_column = column.cast(pl.Utf8)
454
+
455
+ # Create a list of 'when/then' expressions, one for each mapping
456
+ mapping_expressions = [
457
+ pl.when(str_column == from_val).then(pl.lit(to_val))
458
+ for from_val, to_val in self.mapping.items()
459
+ ]
460
+
461
+ # Use coalesce to find the first non-null value.
462
+ # The default_value acts as the final fallback.
463
+ final_expr = pl.coalesce(
464
+ *mapping_expressions, # Unpack the list of expressions
465
+ pl.lit(self.default_value)
466
+ )
467
+
468
+ return pl.select(final_expr).to_series()
411
469
 
412
470
 
413
471
  class ValueBinner:
@@ -1,4 +1,5 @@
1
1
  import pandas as pd
2
+ from pandas.api.types import is_numeric_dtype
2
3
  import numpy as np
3
4
  import matplotlib.pyplot as plt
4
5
  import seaborn as sns
@@ -24,7 +25,8 @@ __all__ = [
24
25
  "plot_value_distributions",
25
26
  "clip_outliers_single",
26
27
  "clip_outliers_multi",
27
- "match_and_filter_columns_by_regex"
28
+ "match_and_filter_columns_by_regex",
29
+ "standardize_percentages"
28
30
  ]
29
31
 
30
32
 
@@ -575,6 +577,72 @@ def match_and_filter_columns_by_regex(
575
577
  return filtered_df, matched_columns
576
578
 
577
579
 
580
def standardize_percentages(
    df: pd.DataFrame,
    columns: list[str],
    treat_one_as_proportion: bool = True,
    round_digits: int = 2
) -> pd.DataFrame:
    """
    Standardizes numeric columns containing mixed-format percentages.

    This function cleans columns where percentages might be entered as whole
    numbers (e.g., 55) or as proportions (e.g., 0.55). It assumes values
    between 0 and 1 are proportions and multiplies them by 100. Missing
    values and values outside that range are left unchanged.

    Args:
        df (pd.DataFrame): The input pandas DataFrame. It is not modified.
        columns (list[str]): A list of column names to standardize. Names that
            appear more than once are processed a single time; missing or
            non-numeric columns are skipped with a printed warning.
        treat_one_as_proportion (bool):
            - If True (default): The value `1` is treated as a proportion and converted to `100`.
            - If False: The value `1` is treated as `1%`.
        round_digits (int): The number of decimal places to round the final result to.

    Returns:
        (pd.DataFrame):
            A new DataFrame with the specified columns cleaned and standardized.
    """
    df_copy = df.copy()

    if df_copy.empty:
        return df_copy

    def _clean_value(x: float) -> float:
        """Applies the standardization rule to a single value."""
        if pd.isna(x):
            return x

        # Proportion range is [0, 1] when 1 counts as a proportion,
        # otherwise [0, 1) so that 1 stays "1%".
        if treat_one_as_proportion:
            is_proportion = 0 <= x <= 1
        else:
            is_proportion = 0 <= x < 1

        # Anything else is assumed to already be a percentage.
        return x * 100 if is_proportion else x

    # De-duplicate while preserving order: scaling the same column twice
    # would turn e.g. 0.005 -> 0.5 -> 50.
    for col in dict.fromkeys(columns):
        # --- Robustness Checks ---
        if col not in df_copy.columns:
            print(f"Warning: Column '{col}' not found. Skipping.")
            continue

        if not is_numeric_dtype(df_copy[col]):
            print(f"Warning: Column '{col}' is not numeric. Skipping.")
            continue

        # --- Applying the Logic ---
        cleaned = df_copy[col].apply(_clean_value)
        df_copy[col] = cleaned.round(round_digits)

    return df_copy
644
+
645
+
578
646
  def _is_notebook():
579
647
  return get_ipython() is not None
580
648
 
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "dragon-ml-toolbox"
3
- version = "2.2.0"
3
+ version = "2.2.1"
4
4
  description = "A collection of tools for data science and machine learning projects"
5
5
  authors = [
6
6
  { name = "Karl Loza", email = "luigiloza@gmail.com" }