dragon-ml-toolbox 2.2.0__tar.gz → 2.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dragon-ml-toolbox might be problematic. Click here for more details.
- {dragon_ml_toolbox-2.2.0/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-2.2.1}/PKG-INFO +1 -1
- {dragon_ml_toolbox-2.2.0 → dragon_ml_toolbox-2.2.1/dragon_ml_toolbox.egg-info}/PKG-INFO +1 -1
- {dragon_ml_toolbox-2.2.0 → dragon_ml_toolbox-2.2.1}/ml_tools/ETL_engineering.py +61 -3
- {dragon_ml_toolbox-2.2.0 → dragon_ml_toolbox-2.2.1}/ml_tools/data_exploration.py +69 -1
- {dragon_ml_toolbox-2.2.0 → dragon_ml_toolbox-2.2.1}/pyproject.toml +1 -1
- {dragon_ml_toolbox-2.2.0 → dragon_ml_toolbox-2.2.1}/LICENSE +0 -0
- {dragon_ml_toolbox-2.2.0 → dragon_ml_toolbox-2.2.1}/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-2.2.0 → dragon_ml_toolbox-2.2.1}/README.md +0 -0
- {dragon_ml_toolbox-2.2.0 → dragon_ml_toolbox-2.2.1}/dragon_ml_toolbox.egg-info/SOURCES.txt +0 -0
- {dragon_ml_toolbox-2.2.0 → dragon_ml_toolbox-2.2.1}/dragon_ml_toolbox.egg-info/dependency_links.txt +0 -0
- {dragon_ml_toolbox-2.2.0 → dragon_ml_toolbox-2.2.1}/dragon_ml_toolbox.egg-info/requires.txt +0 -0
- {dragon_ml_toolbox-2.2.0 → dragon_ml_toolbox-2.2.1}/dragon_ml_toolbox.egg-info/top_level.txt +0 -0
- {dragon_ml_toolbox-2.2.0 → dragon_ml_toolbox-2.2.1}/ml_tools/MICE_imputation.py +0 -0
- {dragon_ml_toolbox-2.2.0 → dragon_ml_toolbox-2.2.1}/ml_tools/PSO_optimization.py +0 -0
- {dragon_ml_toolbox-2.2.0 → dragon_ml_toolbox-2.2.1}/ml_tools/VIF_factor.py +0 -0
- {dragon_ml_toolbox-2.2.0 → dragon_ml_toolbox-2.2.1}/ml_tools/__init__.py +0 -0
- {dragon_ml_toolbox-2.2.0 → dragon_ml_toolbox-2.2.1}/ml_tools/_particle_swarm_optimization.py +0 -0
- {dragon_ml_toolbox-2.2.0 → dragon_ml_toolbox-2.2.1}/ml_tools/datasetmaster.py +0 -0
- {dragon_ml_toolbox-2.2.0 → dragon_ml_toolbox-2.2.1}/ml_tools/ensemble_learning.py +0 -0
- {dragon_ml_toolbox-2.2.0 → dragon_ml_toolbox-2.2.1}/ml_tools/handle_excel.py +0 -0
- {dragon_ml_toolbox-2.2.0 → dragon_ml_toolbox-2.2.1}/ml_tools/logger.py +0 -0
- {dragon_ml_toolbox-2.2.0 → dragon_ml_toolbox-2.2.1}/ml_tools/pytorch_models.py +0 -0
- {dragon_ml_toolbox-2.2.0 → dragon_ml_toolbox-2.2.1}/ml_tools/trainer.py +0 -0
- {dragon_ml_toolbox-2.2.0 → dragon_ml_toolbox-2.2.1}/ml_tools/utilities.py +0 -0
- {dragon_ml_toolbox-2.2.0 → dragon_ml_toolbox-2.2.1}/ml_tools/vision_helpers.py +0 -0
- {dragon_ml_toolbox-2.2.0 → dragon_ml_toolbox-2.2.1}/setup.cfg +0 -0
|
@@ -101,7 +101,7 @@ class DataProcessor:
|
|
|
101
101
|
raise TypeError("The recipe must be an instance of TransformationRecipe.")
|
|
102
102
|
if len(recipe) == 0:
|
|
103
103
|
raise ValueError("The recipe cannot be empty.")
|
|
104
|
-
self.
|
|
104
|
+
self._recipe = recipe
|
|
105
105
|
|
|
106
106
|
def transform(self, df: pl.DataFrame) -> pl.DataFrame:
|
|
107
107
|
"""
|
|
@@ -109,7 +109,7 @@ class DataProcessor:
|
|
|
109
109
|
"""
|
|
110
110
|
processed_columns = []
|
|
111
111
|
# Recipe object is iterable
|
|
112
|
-
for step in self.
|
|
112
|
+
for step in self._recipe:
|
|
113
113
|
input_col_name = step["input_col"]
|
|
114
114
|
output_col_spec = step["output_col"]
|
|
115
115
|
transform_action = step["transform"]
|
|
@@ -154,6 +154,49 @@ class DataProcessor:
|
|
|
154
154
|
return pl.DataFrame()
|
|
155
155
|
|
|
156
156
|
return pl.DataFrame(processed_columns)
|
|
157
|
+
|
|
158
|
+
def __str__(self) -> str:
    """
    Provides a detailed, human-readable string representation of the
    entire processing pipeline.

    The text starts with a header, a divider and the number of steps,
    followed by one entry per recipe step showing its input column, the
    name of the transformation and the output column specification.

    Returns:
        str: The formatted, multi-line pipeline description.
    """
    header = "DataProcessor Pipeline"
    divider = "-" * len(header)
    num_steps = len(self._recipe)

    lines = [
        header,
        divider,
        # The trailing newline yields a blank line after the summary
        # once everything is joined with "\n".
        f"Number of steps: {num_steps}\n",
    ]

    if num_steps == 0:
        lines.append("No transformation steps defined.")
        return "\n".join(lines)

    for i, step in enumerate(self._recipe, 1):
        transform_action = step["transform"]

        # Get a clean name for the transformation action
        if transform_action == _RENAME:  # "rename"
            transform_name = "Rename"
        else:
            # Plain functions expose their own ``__name__``; taking
            # ``type(fn).__name__`` would only ever say "function".
            # Class instances normally have no ``__name__``, so they
            # fall back to the name of their class.
            transform_name = getattr(
                transform_action,
                "__name__",
                type(transform_action).__name__,
            )

        lines.append(f"[{i}] Input: '{step['input_col']}'")
        lines.append(f" - Transform: {transform_name}")
        lines.append(f" - Output(s): {step['output_col']}")
        if i < num_steps:
            lines.append("")  # Add a blank line between steps

    return "\n".join(lines)
|
|
194
|
+
|
|
195
|
+
def inspect(self) -> None:
    """
    Writes the pipeline description (the result of ``str(self)``) to the console.
    """
    description = str(self)
    print(description)
|
|
157
200
|
|
|
158
201
|
|
|
159
202
|
class KeywordDummifier:
|
|
@@ -407,7 +450,22 @@ class CategoryMapper:
|
|
|
407
450
|
pl.Series: A new Series with categories mapped to numbers.
|
|
408
451
|
"""
|
|
409
452
|
# Ensure the column is treated as a string for matching keys
|
|
410
|
-
|
|
453
|
+
str_column = column.cast(pl.Utf8)
|
|
454
|
+
|
|
455
|
+
# Create a list of 'when/then' expressions, one for each mapping
|
|
456
|
+
mapping_expressions = [
|
|
457
|
+
pl.when(str_column == from_val).then(pl.lit(to_val))
|
|
458
|
+
for from_val, to_val in self.mapping.items()
|
|
459
|
+
]
|
|
460
|
+
|
|
461
|
+
# Use coalesce to find the first non-null value.
|
|
462
|
+
# The default_value acts as the final fallback.
|
|
463
|
+
final_expr = pl.coalesce(
|
|
464
|
+
*mapping_expressions, # Unpack the list of expressions
|
|
465
|
+
pl.lit(self.default_value)
|
|
466
|
+
)
|
|
467
|
+
|
|
468
|
+
return pl.select(final_expr).to_series()
|
|
411
469
|
|
|
412
470
|
|
|
413
471
|
class ValueBinner:
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import pandas as pd
|
|
2
|
+
from pandas.api.types import is_numeric_dtype
|
|
2
3
|
import numpy as np
|
|
3
4
|
import matplotlib.pyplot as plt
|
|
4
5
|
import seaborn as sns
|
|
@@ -24,7 +25,8 @@ __all__ = [
|
|
|
24
25
|
"plot_value_distributions",
|
|
25
26
|
"clip_outliers_single",
|
|
26
27
|
"clip_outliers_multi",
|
|
27
|
-
"match_and_filter_columns_by_regex"
|
|
28
|
+
"match_and_filter_columns_by_regex",
|
|
29
|
+
"standardize_percentages"
|
|
28
30
|
]
|
|
29
31
|
|
|
30
32
|
|
|
@@ -575,6 +577,72 @@ def match_and_filter_columns_by_regex(
|
|
|
575
577
|
return filtered_df, matched_columns
|
|
576
578
|
|
|
577
579
|
|
|
580
|
+
def standardize_percentages(
    df: pd.DataFrame,
    columns: list[str],
    treat_one_as_proportion: bool = True,
    round_digits: int = 2
) -> pd.DataFrame:
    """
    Standardizes numeric columns containing mixed-format percentages.

    This function cleans columns where percentages might be entered as whole
    numbers (e.g., 55) or as proportions (e.g., 0.55). It assumes values
    between 0 and 1 are proportions and multiplies them by 100. Values
    outside that range (including negatives) and missing values are left
    unchanged.

    Columns that are missing from the DataFrame, or that are not numeric,
    are skipped with a printed warning. Boolean columns are treated as
    non-numeric: pandas reports them as numeric, but rewriting True/False
    into 100/0 is never the intended result.

    Args:
        df (pd.DataFrame): The input pandas DataFrame. It is not modified.
        columns (list[str]): A list of column names to standardize.
        treat_one_as_proportion (bool):
            - If True (default): The value `1` is treated as a proportion and converted to `100`.
            - If False: The value `1` is treated as `1%`.
        round_digits (int): The number of decimal places to round the final result to.

    Returns:
        (pd.DataFrame):
            A new DataFrame with the specified columns cleaned and standardized.
    """
    df_copy = df.copy()

    if df_copy.empty:
        return df_copy

    for col in columns:
        # --- Robustness Checks ---
        if col not in df_copy.columns:
            print(f"Warning: Column '{col}' not found. Skipping.")
            continue

        series = df_copy[col]

        # is_numeric_dtype() is True for booleans, so exclude them explicitly.
        if not is_numeric_dtype(series) or pd.api.types.is_bool_dtype(series):
            print(f"Warning: Column '{col}' is not numeric. Skipping.")
            continue

        # --- Applying the Logic ---
        # Proportions live in [0, 1] when 1 counts as a proportion,
        # otherwise in [0, 1) so that 1 is kept as 1%.
        if treat_one_as_proportion:
            within_upper = series <= 1
        else:
            within_upper = series < 1

        # Comparisons against missing values are False (or NA for nullable
        # dtypes); normalise to plain booleans so missing entries are
        # never selected for scaling.
        is_proportion = ((series >= 0) & within_upper).fillna(False).astype(bool)

        # Scale only the proportions, then round the whole column.
        df_copy[col] = series.mask(is_proportion, series * 100).round(round_digits)

    return df_copy
|
|
644
|
+
|
|
645
|
+
|
|
578
646
|
def _is_notebook():
|
|
579
647
|
return get_ipython() is not None
|
|
580
648
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dragon_ml_toolbox-2.2.0 → dragon_ml_toolbox-2.2.1}/dragon_ml_toolbox.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
|
File without changes
|
{dragon_ml_toolbox-2.2.0 → dragon_ml_toolbox-2.2.1}/dragon_ml_toolbox.egg-info/top_level.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dragon_ml_toolbox-2.2.0 → dragon_ml_toolbox-2.2.1}/ml_tools/_particle_swarm_optimization.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|