dragon-ml-toolbox 20.13.0__py3-none-any.whl → 20.14.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dragon-ml-toolbox
3
- Version: 20.13.0
3
+ Version: 20.14.0
4
4
  Summary: Complete pipelines and helper tools for data science and machine learning projects.
5
5
  Author-email: Karl Luigi Loza Vidaurre <luigiloza@gmail.com>
6
6
  License-Expression: MIT
@@ -1,5 +1,5 @@
1
- dragon_ml_toolbox-20.13.0.dist-info/licenses/LICENSE,sha256=L35WDmmLZNTlJvxF6Vy7Uy4SYNi6rCfWUqlTHpoRMoU,1081
2
- dragon_ml_toolbox-20.13.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=0-HBRMMgKuwtGy6nMJZvIn1fLxhx_ksyyVB2U_iyYZU,2818
1
+ dragon_ml_toolbox-20.14.0.dist-info/licenses/LICENSE,sha256=L35WDmmLZNTlJvxF6Vy7Uy4SYNi6rCfWUqlTHpoRMoU,1081
2
+ dragon_ml_toolbox-20.14.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=0-HBRMMgKuwtGy6nMJZvIn1fLxhx_ksyyVB2U_iyYZU,2818
3
3
  ml_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
4
  ml_tools/constants.py,sha256=3br5Rk9cL2IUo638eJuMOGdbGQaWssaUecYEvSeRBLM,3322
5
5
  ml_tools/ETL_cleaning/__init__.py,sha256=gLRHF-qzwpqKTvbbn9chIQELeUDh_XGpBRX28j-5IqI,545
@@ -80,7 +80,7 @@ ml_tools/ML_optimization/__init__.py,sha256=No18Dsw6Q9zPt8B9fpG0bWomuXmwDC7Dioki
80
80
  ml_tools/ML_optimization/_multi_dragon.py,sha256=zQhDxFY8FNxUlcbSnHMVArfojzYjgNa21jSE3pJmRW0,38956
81
81
  ml_tools/ML_optimization/_single_dragon.py,sha256=jh5-SK6NKAzbheQhquiYoROozk-RzUv1jiFkIzK_AFg,7288
82
82
  ml_tools/ML_optimization/_single_manual.py,sha256=h-_k9JmRqPkjTra1nu7AyYbSyWkYZ1R3utiNmW06WFs,21809
83
- ml_tools/ML_scaler/_ML_scaler.py,sha256=P75X0Sx8N-VxC2Qy8aG7mWaZlkTfjspiZDi1YiMQD1I,8872
83
+ ml_tools/ML_scaler/_ML_scaler.py,sha256=NcwprqrAHMIKpkzMdExk99I2QpfTSbiJH8rDqmOlnkU,8870
84
84
  ml_tools/ML_scaler/__init__.py,sha256=SHDNyLsoOLl2OtkIb3pGg-JRs3E2bYJBgnHwH3vw_Tk,172
85
85
  ml_tools/ML_trainer/__init__.py,sha256=42kueHa7Z0b_yLbywNCgIxlW6WmgLBqkTFwKH7vFLXw,379
86
86
  ml_tools/ML_trainer/_base_trainer.py,sha256=0ATm672NRsjJ6nv_NEl6-OEd9Bst1-s5OPxfG4qe8Lg,18075
@@ -104,10 +104,10 @@ ml_tools/_core/__init__.py,sha256=m-VP0RW0tOTm9N5NI3kFNcpM7WtVgs0RK9pK3ZJRZQQ,14
104
104
  ml_tools/_core/_logger.py,sha256=xzhn_FouMDRVNwXGBGlPC9Ruq6i5uCrmNaS5jesguMU,4972
105
105
  ml_tools/_core/_schema_load_ops.py,sha256=KLs9vBzANz5ESe2wlP-C41N4VlgGil-ywcfvWKSOGss,1551
106
106
  ml_tools/_core/_script_info.py,sha256=LtFGt10gEvCnhIRMKJPi2yXkiGLcdr7lE-oIP2XGHzQ,234
107
- ml_tools/data_exploration/__init__.py,sha256=efUBsruHL56B429tUadl3PdG73zAF639Y430uMQRfko,1917
108
- ml_tools/data_exploration/_analysis.py,sha256=PJNrEBz5ZZXHoUlQ6fh9Y86nzPQrLpVPv2Ye4NfOxgs,14181
107
+ ml_tools/data_exploration/__init__.py,sha256=XNA8gcRx5ifrv092HA7HSpek8havlk_3RZi9aq9dSjg,1957
108
+ ml_tools/data_exploration/_analysis.py,sha256=JSoFJSkv4-_v9YxxmjHZ_PeFRneDENjSEo2sy_uC4oY,14196
109
109
  ml_tools/data_exploration/_cleaning.py,sha256=pAZOXgGK35j7O8q6cnyTwYK1GLNnD04A8p2fSyMB1mg,20906
110
- ml_tools/data_exploration/_features.py,sha256=Z1noJfDxBzFRfusFp6NlpLF2NItuZuzFHq4ssWFqny4,26273
110
+ ml_tools/data_exploration/_features.py,sha256=twJ6OixU4ItRXA8rPJRfg2N9QVsbn38CFqJiLcXav1A,28664
111
111
  ml_tools/data_exploration/_plotting.py,sha256=zH1dPcIoAlOuww23xIoBCsQOAshPPv9OyGposOA2RvI,19883
112
112
  ml_tools/data_exploration/_schema_ops.py,sha256=Fd6fBGGv4OpxmJ1HG9pith6QL90z0tzssCvzkQxlEEQ,11083
113
113
  ml_tools/ensemble_evaluation/__init__.py,sha256=t4Gr8EGEk8RLatyc92-S0BzbQvdvodzoF-qDAH2qjVg,546
@@ -143,7 +143,7 @@ ml_tools/utilities/__init__.py,sha256=h4lE3SQstg-opcQj6QSKhu-HkqSbmHExsWoM9vC5D9
143
143
  ml_tools/utilities/_translate.py,sha256=U8hRPa3PmTpIf9n9yR3gBGmp_hkcsjQLwjAHSHc0WHs,10325
144
144
  ml_tools/utilities/_utility_save_load.py,sha256=EFvFaTaHahDQWdJWZr-j7cHqRbG_Xrpc96228JhV-bs,16773
145
145
  ml_tools/utilities/_utility_tools.py,sha256=bN0J9d1S0W5wNzNntBWqDsJcEAK7-1OgQg3X2fwXns0,6918
146
- dragon_ml_toolbox-20.13.0.dist-info/METADATA,sha256=bTnTpMlvOFu2IlYpmc0QphbYeqbslxzptluUbEWaO-s,7889
147
- dragon_ml_toolbox-20.13.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
148
- dragon_ml_toolbox-20.13.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
149
- dragon_ml_toolbox-20.13.0.dist-info/RECORD,,
146
+ dragon_ml_toolbox-20.14.0.dist-info/METADATA,sha256=32IleSQa7t7E42ZB5rM32Lf1MlSAMtKkU-TFky3VckA,7889
147
+ dragon_ml_toolbox-20.14.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
148
+ dragon_ml_toolbox-20.14.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
149
+ dragon_ml_toolbox-20.14.0.dist-info/RECORD,,
@@ -99,7 +99,7 @@ class DragonScaler:
99
99
  std = torch.sqrt(torch.clamp(variance, min=1e-8))
100
100
 
101
101
  if verbose >= 2:
102
- _LOGGER.info(f"Scaler fitted on {n_total} samples for {num_continuous_features} features (Welford's).")
102
+ _LOGGER.info(f"Scaler fitted on {n_total} samples for {num_continuous_features} columns (Welford's).")
103
103
  return cls(mean=mean_global, std=std, continuous_feature_indices=continuous_feature_indices)
104
104
 
105
105
  @classmethod
@@ -121,7 +121,7 @@ class DragonScaler:
121
121
  std = torch.where(std == 0, torch.tensor(1.0, device=data.device), std)
122
122
 
123
123
  if verbose >= 2:
124
- _LOGGER.info(f"Scaler fitted on tensor with {data.shape[0]} samples for {num_features} features.")
124
+ _LOGGER.info(f"Scaler fitted on tensor with {data.shape[0]} samples for {num_features} columns.")
125
125
 
126
126
  return cls(mean=mean, std=std, continuous_feature_indices=indices)
127
127
 
@@ -33,6 +33,7 @@ from ._features import (
33
33
  reconstruct_one_hot,
34
34
  reconstruct_binary,
35
35
  reconstruct_multibinary,
36
+ filter_subset,
36
37
  )
37
38
 
38
39
  from ._schema_ops import (
@@ -51,6 +52,7 @@ __all__ = [
51
52
  "drop_columns_with_missing_data",
52
53
  "drop_macro",
53
54
  "clean_column_names",
55
+ "filter_subset",
54
56
  "plot_value_distributions",
55
57
  "split_features_targets",
56
58
  "split_continuous_binary",
@@ -34,7 +34,7 @@ def summarize_dataframe(df: pd.DataFrame, round_digits: int = 2):
34
34
  """
35
35
  summary = pd.DataFrame({
36
36
  'Data Type': df.dtypes,
37
- 'Completeness %': (df.notnull().mean() * 100).round(2),
37
+ 'Completeness %': (df.notnull().mean() * 100).round(2), # type: ignore
38
38
  'Unique Values': df.nunique(),
39
39
  # 'Missing %': (df.isnull().mean() * 100).round(2)
40
40
  })
@@ -657,3 +657,66 @@ def reconstruct_multibinary(
657
657
  _LOGGER.info(f"Reconstructed {converted_count} binary columns matching '{pattern}'.")
658
658
 
659
659
  return new_df, target_columns
660
+
661
+
662
def filter_subset(
    df: pd.DataFrame,
    filters: Union[dict[str, Any], dict[str, list[Any]]],
    drop_filter_cols: bool = True,
    reset_index: bool = True,
    verbose: int = 3
) -> pd.DataFrame:
    """
    Return the rows of a DataFrame that satisfy every column-value condition.

    Supported filter shapes:
        - Single value (e.g., {"Color": "Blue"}) -> rows where the column equals the value.
        - List of values (e.g., {"Color": ["Blue", "Red"]}) -> OR logic within the column.
        - Several columns (e.g., {"Color": "Blue", "Size": "Large"}) -> AND logic between columns.

    Only `list` instances are treated as "multiple values"; any other object
    (including tuples and sets) is compared with `==` as a single value.

    Args:
        df (pd.DataFrame): Input DataFrame. It is never modified.
        filters (dict[str, Any] | dict[str, list[Any]]): Maps column names to the target value (scalar or list).
            An empty dictionary selects every row.
        drop_filter_cols (bool): If True, the columns used as filter keys are removed from the result.
        reset_index (bool): If True, the result gets a fresh 0..n-1 index; otherwise the original labels are kept.
        verbose (int): Verbosity level.
            - >= 1: warn when a scalar filter value is a float.
            - >= 2: log the original and filtered shapes.
            - >= 3: also log the column drop and the index reset.

    Returns:
        pd.DataFrame: A new DataFrame holding the matching rows.

    Raises:
        ValueError: If any key of `filters` is not a column of `df`. The message lists the missing columns.
    """
    # Validate first, so a bad request fails before any data is touched.
    missing_cols = [col for col in filters if col not in df.columns]
    if missing_cols:
        message = f"Filter columns not found: {missing_cols}"
        _LOGGER.error(message)
        # Carry the same text in the exception so callers that catch it
        # (or run without logging configured) still see what was missing.
        raise ValueError(message)

    if verbose >= 2:
        _LOGGER.info(f"Original shape: {df.shape}")

    # Accumulate one boolean mask over the original rows instead of
    # materialising an intermediate DataFrame per filter.
    mask = None
    for col, value in filters.items():
        if isinstance(value, list):
            # OR logic within a column
            condition = df[col].isin(value)
        else:
            # Exact equality on floats is fragile, so let the user know.
            if isinstance(value, float) and verbose >= 1:
                _LOGGER.warning(f"Filtering on column '{col}' with float value '{value}'.")
            condition = df[col] == value
        # AND logic between columns
        mask = condition if mask is None else mask & condition

    # Copy only the surviving rows; the result never shares state with `df`.
    df_filtered = df.copy() if mask is None else df.loc[mask].copy()

    if drop_filter_cols:
        if verbose >= 3:
            _LOGGER.info(f"Dropping filter columns: {list(filters.keys())}")
        df_filtered = df_filtered.drop(columns=list(filters))

    if reset_index:
        if verbose >= 3:
            _LOGGER.info("Resetting index of the filtered DataFrame.")
        df_filtered = df_filtered.reset_index(drop=True)

    if verbose >= 2:
        _LOGGER.info(f"Filtered shape: {df_filtered.shape}")

    return df_filtered