dragon-ml-toolbox 20.13.0__py3-none-any.whl → 20.14.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dragon_ml_toolbox-20.13.0.dist-info → dragon_ml_toolbox-20.14.0.dist-info}/METADATA +1 -1
- {dragon_ml_toolbox-20.13.0.dist-info → dragon_ml_toolbox-20.14.0.dist-info}/RECORD +10 -10
- ml_tools/ML_scaler/_ML_scaler.py +2 -2
- ml_tools/data_exploration/__init__.py +2 -0
- ml_tools/data_exploration/_analysis.py +1 -1
- ml_tools/data_exploration/_features.py +63 -0
- {dragon_ml_toolbox-20.13.0.dist-info → dragon_ml_toolbox-20.14.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-20.13.0.dist-info → dragon_ml_toolbox-20.14.0.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-20.13.0.dist-info → dragon_ml_toolbox-20.14.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-20.13.0.dist-info → dragon_ml_toolbox-20.14.0.dist-info}/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dragon-ml-toolbox
|
|
3
|
-
Version: 20.13.0
|
|
3
|
+
Version: 20.14.0
|
|
4
4
|
Summary: Complete pipelines and helper tools for data science and machine learning projects.
|
|
5
5
|
Author-email: Karl Luigi Loza Vidaurre <luigiloza@gmail.com>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
dragon_ml_toolbox-20.
|
|
2
|
-
dragon_ml_toolbox-20.
|
|
1
|
+
dragon_ml_toolbox-20.14.0.dist-info/licenses/LICENSE,sha256=L35WDmmLZNTlJvxF6Vy7Uy4SYNi6rCfWUqlTHpoRMoU,1081
|
|
2
|
+
dragon_ml_toolbox-20.14.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=0-HBRMMgKuwtGy6nMJZvIn1fLxhx_ksyyVB2U_iyYZU,2818
|
|
3
3
|
ml_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
4
4
|
ml_tools/constants.py,sha256=3br5Rk9cL2IUo638eJuMOGdbGQaWssaUecYEvSeRBLM,3322
|
|
5
5
|
ml_tools/ETL_cleaning/__init__.py,sha256=gLRHF-qzwpqKTvbbn9chIQELeUDh_XGpBRX28j-5IqI,545
|
|
@@ -80,7 +80,7 @@ ml_tools/ML_optimization/__init__.py,sha256=No18Dsw6Q9zPt8B9fpG0bWomuXmwDC7Dioki
|
|
|
80
80
|
ml_tools/ML_optimization/_multi_dragon.py,sha256=zQhDxFY8FNxUlcbSnHMVArfojzYjgNa21jSE3pJmRW0,38956
|
|
81
81
|
ml_tools/ML_optimization/_single_dragon.py,sha256=jh5-SK6NKAzbheQhquiYoROozk-RzUv1jiFkIzK_AFg,7288
|
|
82
82
|
ml_tools/ML_optimization/_single_manual.py,sha256=h-_k9JmRqPkjTra1nu7AyYbSyWkYZ1R3utiNmW06WFs,21809
|
|
83
|
-
ml_tools/ML_scaler/_ML_scaler.py,sha256=
|
|
83
|
+
ml_tools/ML_scaler/_ML_scaler.py,sha256=NcwprqrAHMIKpkzMdExk99I2QpfTSbiJH8rDqmOlnkU,8870
|
|
84
84
|
ml_tools/ML_scaler/__init__.py,sha256=SHDNyLsoOLl2OtkIb3pGg-JRs3E2bYJBgnHwH3vw_Tk,172
|
|
85
85
|
ml_tools/ML_trainer/__init__.py,sha256=42kueHa7Z0b_yLbywNCgIxlW6WmgLBqkTFwKH7vFLXw,379
|
|
86
86
|
ml_tools/ML_trainer/_base_trainer.py,sha256=0ATm672NRsjJ6nv_NEl6-OEd9Bst1-s5OPxfG4qe8Lg,18075
|
|
@@ -104,10 +104,10 @@ ml_tools/_core/__init__.py,sha256=m-VP0RW0tOTm9N5NI3kFNcpM7WtVgs0RK9pK3ZJRZQQ,14
|
|
|
104
104
|
ml_tools/_core/_logger.py,sha256=xzhn_FouMDRVNwXGBGlPC9Ruq6i5uCrmNaS5jesguMU,4972
|
|
105
105
|
ml_tools/_core/_schema_load_ops.py,sha256=KLs9vBzANz5ESe2wlP-C41N4VlgGil-ywcfvWKSOGss,1551
|
|
106
106
|
ml_tools/_core/_script_info.py,sha256=LtFGt10gEvCnhIRMKJPi2yXkiGLcdr7lE-oIP2XGHzQ,234
|
|
107
|
-
ml_tools/data_exploration/__init__.py,sha256=
|
|
108
|
-
ml_tools/data_exploration/_analysis.py,sha256=
|
|
107
|
+
ml_tools/data_exploration/__init__.py,sha256=XNA8gcRx5ifrv092HA7HSpek8havlk_3RZi9aq9dSjg,1957
|
|
108
|
+
ml_tools/data_exploration/_analysis.py,sha256=JSoFJSkv4-_v9YxxmjHZ_PeFRneDENjSEo2sy_uC4oY,14196
|
|
109
109
|
ml_tools/data_exploration/_cleaning.py,sha256=pAZOXgGK35j7O8q6cnyTwYK1GLNnD04A8p2fSyMB1mg,20906
|
|
110
|
-
ml_tools/data_exploration/_features.py,sha256=
|
|
110
|
+
ml_tools/data_exploration/_features.py,sha256=twJ6OixU4ItRXA8rPJRfg2N9QVsbn38CFqJiLcXav1A,28664
|
|
111
111
|
ml_tools/data_exploration/_plotting.py,sha256=zH1dPcIoAlOuww23xIoBCsQOAshPPv9OyGposOA2RvI,19883
|
|
112
112
|
ml_tools/data_exploration/_schema_ops.py,sha256=Fd6fBGGv4OpxmJ1HG9pith6QL90z0tzssCvzkQxlEEQ,11083
|
|
113
113
|
ml_tools/ensemble_evaluation/__init__.py,sha256=t4Gr8EGEk8RLatyc92-S0BzbQvdvodzoF-qDAH2qjVg,546
|
|
@@ -143,7 +143,7 @@ ml_tools/utilities/__init__.py,sha256=h4lE3SQstg-opcQj6QSKhu-HkqSbmHExsWoM9vC5D9
|
|
|
143
143
|
ml_tools/utilities/_translate.py,sha256=U8hRPa3PmTpIf9n9yR3gBGmp_hkcsjQLwjAHSHc0WHs,10325
|
|
144
144
|
ml_tools/utilities/_utility_save_load.py,sha256=EFvFaTaHahDQWdJWZr-j7cHqRbG_Xrpc96228JhV-bs,16773
|
|
145
145
|
ml_tools/utilities/_utility_tools.py,sha256=bN0J9d1S0W5wNzNntBWqDsJcEAK7-1OgQg3X2fwXns0,6918
|
|
146
|
-
dragon_ml_toolbox-20.
|
|
147
|
-
dragon_ml_toolbox-20.
|
|
148
|
-
dragon_ml_toolbox-20.
|
|
149
|
-
dragon_ml_toolbox-20.
|
|
146
|
+
dragon_ml_toolbox-20.14.0.dist-info/METADATA,sha256=32IleSQa7t7E42ZB5rM32Lf1MlSAMtKkU-TFky3VckA,7889
|
|
147
|
+
dragon_ml_toolbox-20.14.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
148
|
+
dragon_ml_toolbox-20.14.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
|
|
149
|
+
dragon_ml_toolbox-20.14.0.dist-info/RECORD,,
|
ml_tools/ML_scaler/_ML_scaler.py
CHANGED
|
@@ -99,7 +99,7 @@ class DragonScaler:
|
|
|
99
99
|
std = torch.sqrt(torch.clamp(variance, min=1e-8))
|
|
100
100
|
|
|
101
101
|
if verbose >= 2:
|
|
102
|
-
_LOGGER.info(f"Scaler fitted on {n_total} samples for {num_continuous_features}
|
|
102
|
+
_LOGGER.info(f"Scaler fitted on {n_total} samples for {num_continuous_features} columns (Welford's).")
|
|
103
103
|
return cls(mean=mean_global, std=std, continuous_feature_indices=continuous_feature_indices)
|
|
104
104
|
|
|
105
105
|
@classmethod
|
|
@@ -121,7 +121,7 @@ class DragonScaler:
|
|
|
121
121
|
std = torch.where(std == 0, torch.tensor(1.0, device=data.device), std)
|
|
122
122
|
|
|
123
123
|
if verbose >= 2:
|
|
124
|
-
_LOGGER.info(f"Scaler fitted on tensor with {data.shape[0]} samples for {num_features}
|
|
124
|
+
_LOGGER.info(f"Scaler fitted on tensor with {data.shape[0]} samples for {num_features} columns.")
|
|
125
125
|
|
|
126
126
|
return cls(mean=mean, std=std, continuous_feature_indices=indices)
|
|
127
127
|
|
|
@@ -33,6 +33,7 @@ from ._features import (
|
|
|
33
33
|
reconstruct_one_hot,
|
|
34
34
|
reconstruct_binary,
|
|
35
35
|
reconstruct_multibinary,
|
|
36
|
+
filter_subset,
|
|
36
37
|
)
|
|
37
38
|
|
|
38
39
|
from ._schema_ops import (
|
|
@@ -51,6 +52,7 @@ __all__ = [
|
|
|
51
52
|
"drop_columns_with_missing_data",
|
|
52
53
|
"drop_macro",
|
|
53
54
|
"clean_column_names",
|
|
55
|
+
"filter_subset",
|
|
54
56
|
"plot_value_distributions",
|
|
55
57
|
"split_features_targets",
|
|
56
58
|
"split_continuous_binary",
|
|
@@ -34,7 +34,7 @@ def summarize_dataframe(df: pd.DataFrame, round_digits: int = 2):
|
|
|
34
34
|
"""
|
|
35
35
|
summary = pd.DataFrame({
|
|
36
36
|
'Data Type': df.dtypes,
|
|
37
|
-
'Completeness %': (df.notnull().mean() * 100).round(2),
|
|
37
|
+
'Completeness %': (df.notnull().mean() * 100).round(2), # type: ignore
|
|
38
38
|
'Unique Values': df.nunique(),
|
|
39
39
|
# 'Missing %': (df.isnull().mean() * 100).round(2)
|
|
40
40
|
})
|
|
@@ -657,3 +657,66 @@ def reconstruct_multibinary(
|
|
|
657
657
|
_LOGGER.info(f"Reconstructed {converted_count} binary columns matching '{pattern}'.")
|
|
658
658
|
|
|
659
659
|
return new_df, target_columns
|
|
660
|
+
|
|
661
|
+
|
|
662
|
+
def filter_subset(
    df: pd.DataFrame,
    filters: Union[dict[str, Any], dict[str, list[Any]]],
    drop_filter_cols: bool = True,
    reset_index: bool = True,
    verbose: int = 3
) -> pd.DataFrame:
    """
    Filters the DataFrame based on a dictionary of column-value conditions.

    Supports:
    - Single value matching (e.g., {"Color": "Blue"})
    - Multiple value matching (e.g., {"Color": ["Blue", "Red"]}) -> OR logic within column.
      Lists, tuples, sets and frozensets are all treated as collections of accepted values.
    - Multiple column filtering (e.g., {"Color": "Blue", "Size": "Large"}) -> AND logic between columns.
    - Missing value matching (e.g., {"Color": None}) -> keeps the rows where the column is null.

    The input DataFrame is never modified; a new DataFrame is always returned.

    Args:
        df (pd.DataFrame): Input DataFrame.
        filters (dict[str, Any] | dict[str, list[Any]]): Dictionary where keys are column names and values are the target values (scalar or collection).
        drop_filter_cols (bool): If True, drops the columns used for filtering from the result.
        reset_index (bool): If True, resets the index of the resulting DataFrame.
        verbose (int): Verbosity level. 1 enables warnings, 2 adds shape reports, 3 adds step-by-step messages.

    Returns:
        pd.DataFrame: The filtered DataFrame.

    Raises:
        ValueError: If any key of `filters` is not a column of `df`.
    """
    filter_cols = list(filters)

    # Validate columns exist (before doing any work on the data)
    missing_cols = [col for col in filter_cols if col not in df.columns]
    if missing_cols:
        message = f"Filter columns not found: {missing_cols}"
        _LOGGER.error(message)
        raise ValueError(message)

    if verbose >= 2:
        _LOGGER.info(f"Original shape: {df.shape}")

    # Build one combined boolean mask (AND between columns) instead of copying
    # the whole frame up front and re-slicing it once per filter.
    mask = None
    for col, value in filters.items():
        column = df[col]

        # Handle collection of values (OR logic within column)
        if isinstance(value, (list, tuple, set, frozenset)):
            condition = column.isin(list(value))
        # Handle single value
        else:
            # Warn if the value is a floating point due to potential precision issues
            if isinstance(value, float) and verbose >= 1:
                _LOGGER.warning(f"Filtering on column '{col}' with float value '{value}'.")

            if pd.api.types.is_scalar(value) and pd.isna(value):
                # `== None` / `== NaN` is False for every row; match missing entries explicitly.
                condition = column.isna()
            else:
                condition = column == value

        mask = condition if mask is None else (mask & condition)

    if mask is None:
        # No filters given: still hand back an independent copy.
        df_filtered = df.copy()
    else:
        # `.copy()` detaches the selection from `df`, so callers can mutate the
        # result without pandas chained-assignment warnings.
        df_filtered = df[mask].copy()

    if drop_filter_cols:
        if verbose >= 3:
            _LOGGER.info(f"Dropping filter columns: {filter_cols}")
        df_filtered = df_filtered.drop(columns=filter_cols)

    if reset_index:
        if verbose >= 3:
            _LOGGER.info("Resetting index of the filtered DataFrame.")
        df_filtered = df_filtered.reset_index(drop=True)

    if verbose >= 2:
        _LOGGER.info(f"Filtered shape: {df_filtered.shape}")

    return df_filtered
|
|
File without changes
|
{dragon_ml_toolbox-20.13.0.dist-info → dragon_ml_toolbox-20.14.0.dist-info}/licenses/LICENSE
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|