dragon-ml-toolbox 20.13.0__py3-none-any.whl → 20.14.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
--- dragon_ml_toolbox-20.13.0.dist-info/METADATA
+++ dragon_ml_toolbox-20.14.1.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 20.13.0
+Version: 20.14.1
 Summary: Complete pipelines and helper tools for data science and machine learning projects.
 Author-email: Karl Luigi Loza Vidaurre <luigiloza@gmail.com>
 License-Expression: MIT
--- dragon_ml_toolbox-20.13.0.dist-info/RECORD
+++ dragon_ml_toolbox-20.14.1.dist-info/RECORD
@@ -1,5 +1,5 @@
-dragon_ml_toolbox-20.13.0.dist-info/licenses/LICENSE,sha256=L35WDmmLZNTlJvxF6Vy7Uy4SYNi6rCfWUqlTHpoRMoU,1081
-dragon_ml_toolbox-20.13.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=0-HBRMMgKuwtGy6nMJZvIn1fLxhx_ksyyVB2U_iyYZU,2818
+dragon_ml_toolbox-20.14.1.dist-info/licenses/LICENSE,sha256=L35WDmmLZNTlJvxF6Vy7Uy4SYNi6rCfWUqlTHpoRMoU,1081
+dragon_ml_toolbox-20.14.1.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=0-HBRMMgKuwtGy6nMJZvIn1fLxhx_ksyyVB2U_iyYZU,2818
 ml_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ml_tools/constants.py,sha256=3br5Rk9cL2IUo638eJuMOGdbGQaWssaUecYEvSeRBLM,3322
 ml_tools/ETL_cleaning/__init__.py,sha256=gLRHF-qzwpqKTvbbn9chIQELeUDh_XGpBRX28j-5IqI,545
@@ -80,7 +80,7 @@ ml_tools/ML_optimization/__init__.py,sha256=No18Dsw6Q9zPt8B9fpG0bWomuXmwDC7Dioki
 ml_tools/ML_optimization/_multi_dragon.py,sha256=zQhDxFY8FNxUlcbSnHMVArfojzYjgNa21jSE3pJmRW0,38956
 ml_tools/ML_optimization/_single_dragon.py,sha256=jh5-SK6NKAzbheQhquiYoROozk-RzUv1jiFkIzK_AFg,7288
 ml_tools/ML_optimization/_single_manual.py,sha256=h-_k9JmRqPkjTra1nu7AyYbSyWkYZ1R3utiNmW06WFs,21809
-ml_tools/ML_scaler/_ML_scaler.py,sha256=P75X0Sx8N-VxC2Qy8aG7mWaZlkTfjspiZDi1YiMQD1I,8872
+ml_tools/ML_scaler/_ML_scaler.py,sha256=NcwprqrAHMIKpkzMdExk99I2QpfTSbiJH8rDqmOlnkU,8870
 ml_tools/ML_scaler/__init__.py,sha256=SHDNyLsoOLl2OtkIb3pGg-JRs3E2bYJBgnHwH3vw_Tk,172
 ml_tools/ML_trainer/__init__.py,sha256=42kueHa7Z0b_yLbywNCgIxlW6WmgLBqkTFwKH7vFLXw,379
 ml_tools/ML_trainer/_base_trainer.py,sha256=0ATm672NRsjJ6nv_NEl6-OEd9Bst1-s5OPxfG4qe8Lg,18075
@@ -104,11 +104,11 @@ ml_tools/_core/__init__.py,sha256=m-VP0RW0tOTm9N5NI3kFNcpM7WtVgs0RK9pK3ZJRZQQ,14
 ml_tools/_core/_logger.py,sha256=xzhn_FouMDRVNwXGBGlPC9Ruq6i5uCrmNaS5jesguMU,4972
 ml_tools/_core/_schema_load_ops.py,sha256=KLs9vBzANz5ESe2wlP-C41N4VlgGil-ywcfvWKSOGss,1551
 ml_tools/_core/_script_info.py,sha256=LtFGt10gEvCnhIRMKJPi2yXkiGLcdr7lE-oIP2XGHzQ,234
-ml_tools/data_exploration/__init__.py,sha256=efUBsruHL56B429tUadl3PdG73zAF639Y430uMQRfko,1917
-ml_tools/data_exploration/_analysis.py,sha256=PJNrEBz5ZZXHoUlQ6fh9Y86nzPQrLpVPv2Ye4NfOxgs,14181
+ml_tools/data_exploration/__init__.py,sha256=XNA8gcRx5ifrv092HA7HSpek8havlk_3RZi9aq9dSjg,1957
+ml_tools/data_exploration/_analysis.py,sha256=JSoFJSkv4-_v9YxxmjHZ_PeFRneDENjSEo2sy_uC4oY,14196
 ml_tools/data_exploration/_cleaning.py,sha256=pAZOXgGK35j7O8q6cnyTwYK1GLNnD04A8p2fSyMB1mg,20906
-ml_tools/data_exploration/_features.py,sha256=Z1noJfDxBzFRfusFp6NlpLF2NItuZuzFHq4ssWFqny4,26273
-ml_tools/data_exploration/_plotting.py,sha256=zH1dPcIoAlOuww23xIoBCsQOAshPPv9OyGposOA2RvI,19883
+ml_tools/data_exploration/_features.py,sha256=_aBMW7eqSm6oUj54ftidsv9zdywOkc1eyZgITb82XF8,29237
+ml_tools/data_exploration/_plotting.py,sha256=Vg9qS46akbAyrZAgBrPWg2p29V5vqqY4Bk4SHwZLZNI,19995
 ml_tools/data_exploration/_schema_ops.py,sha256=Fd6fBGGv4OpxmJ1HG9pith6QL90z0tzssCvzkQxlEEQ,11083
 ml_tools/ensemble_evaluation/__init__.py,sha256=t4Gr8EGEk8RLatyc92-S0BzbQvdvodzoF-qDAH2qjVg,546
 ml_tools/ensemble_evaluation/_ensemble_evaluation.py,sha256=-sX9cLMaa0FOQDikmVv2lsCYtQ56Kftd3tILnNej0Hg,28346
@@ -143,7 +143,7 @@ ml_tools/utilities/__init__.py,sha256=h4lE3SQstg-opcQj6QSKhu-HkqSbmHExsWoM9vC5D9
 ml_tools/utilities/_translate.py,sha256=U8hRPa3PmTpIf9n9yR3gBGmp_hkcsjQLwjAHSHc0WHs,10325
 ml_tools/utilities/_utility_save_load.py,sha256=EFvFaTaHahDQWdJWZr-j7cHqRbG_Xrpc96228JhV-bs,16773
 ml_tools/utilities/_utility_tools.py,sha256=bN0J9d1S0W5wNzNntBWqDsJcEAK7-1OgQg3X2fwXns0,6918
-dragon_ml_toolbox-20.13.0.dist-info/METADATA,sha256=bTnTpMlvOFu2IlYpmc0QphbYeqbslxzptluUbEWaO-s,7889
-dragon_ml_toolbox-20.13.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
-dragon_ml_toolbox-20.13.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
-dragon_ml_toolbox-20.13.0.dist-info/RECORD,,
+dragon_ml_toolbox-20.14.1.dist-info/METADATA,sha256=oV6v5gFhRVLpuJ3HgL7Qpn8_Dgk9DGkYcOjSfl2kIh0,7889
+dragon_ml_toolbox-20.14.1.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+dragon_ml_toolbox-20.14.1.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
+dragon_ml_toolbox-20.14.1.dist-info/RECORD,,
--- ml_tools/ML_scaler/_ML_scaler.py
+++ ml_tools/ML_scaler/_ML_scaler.py
@@ -99,7 +99,7 @@ class DragonScaler:
         std = torch.sqrt(torch.clamp(variance, min=1e-8))
 
         if verbose >= 2:
-            _LOGGER.info(f"Scaler fitted on {n_total} samples for {num_continuous_features} features (Welford's).")
+            _LOGGER.info(f"Scaler fitted on {n_total} samples for {num_continuous_features} columns (Welford's).")
         return cls(mean=mean_global, std=std, continuous_feature_indices=continuous_feature_indices)
 
     @classmethod
@@ -121,7 +121,7 @@ class DragonScaler:
         std = torch.where(std == 0, torch.tensor(1.0, device=data.device), std)
 
         if verbose >= 2:
-            _LOGGER.info(f"Scaler fitted on tensor with {data.shape[0]} samples for {num_features} features.")
+            _LOGGER.info(f"Scaler fitted on tensor with {data.shape[0]} samples for {num_features} columns.")
 
         return cls(mean=mean, std=std, continuous_feature_indices=indices)
 
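Note on the first scaler hunk: the "(Welford's)" tag in the log message refers to Welford's online algorithm, which accumulates mean and variance across batches in a single pass. For reference only, a minimal sketch of the standard batch-merge step (Chan et al.), with hypothetical names that are not the library's internals:

    import torch

    def welford_merge(n_a: int, mean_a: torch.Tensor, m2_a: torch.Tensor,
                      batch: torch.Tensor) -> tuple[int, torch.Tensor, torch.Tensor]:
        # m2 is the running sum of squared deviations; variance = m2 / (n - 1)
        n_b = batch.shape[0]
        mean_b = batch.mean(dim=0)
        m2_b = ((batch - mean_b) ** 2).sum(dim=0)
        n = n_a + n_b
        delta = mean_b - mean_a
        mean = mean_a + delta * (n_b / n)
        m2 = m2_a + m2_b + delta.pow(2) * (n_a * n_b / n)
        return n, mean, m2

    # After the last batch: std = torch.sqrt(torch.clamp(m2 / (n - 1), min=1e-8)),
    # mirroring the clamp visible in the hunk above.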
--- ml_tools/data_exploration/__init__.py
+++ ml_tools/data_exploration/__init__.py
@@ -33,6 +33,7 @@ from ._features import (
     reconstruct_one_hot,
     reconstruct_binary,
     reconstruct_multibinary,
+    filter_subset,
 )
 
 from ._schema_ops import (
@@ -51,6 +52,7 @@ __all__ = [
     "drop_columns_with_missing_data",
     "drop_macro",
     "clean_column_names",
+    "filter_subset",
     "plot_value_distributions",
     "split_features_targets",
     "split_continuous_binary",
--- ml_tools/data_exploration/_analysis.py
+++ ml_tools/data_exploration/_analysis.py
@@ -34,7 +34,7 @@ def summarize_dataframe(df: pd.DataFrame, round_digits: int = 2):
     """
     summary = pd.DataFrame({
         'Data Type': df.dtypes,
-        'Completeness %': (df.notnull().mean() * 100).round(2),
+        'Completeness %': (df.notnull().mean() * 100).round(2), # type: ignore
         'Unique Values': df.nunique(),
         # 'Missing %': (df.isnull().mean() * 100).round(2)
     })
--- ml_tools/data_exploration/_features.py
+++ ml_tools/data_exploration/_features.py
@@ -168,6 +168,13 @@ def split_continuous_categorical_targets(
             f" - Categorical: {df_categorical.shape}\n"
             f" - Targets: {df_targets.shape}"
         )
+
+    if isinstance(df_continuous, pd.Series):
+        df_continuous = df_continuous.to_frame()
+    if isinstance(df_categorical, pd.Series):
+        df_categorical = df_categorical.to_frame()
+    if isinstance(df_targets, pd.Series):
+        df_targets = df_targets.to_frame()
 
     return df_continuous, df_categorical, df_targets
 
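The isinstance guards added above cover a pandas edge case: when a split leaves exactly one column, indexing can hand back a pd.Series instead of a pd.DataFrame, which breaks callers expecting three frames. A small illustration using plain pandas (example data invented):

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2], "b": [3.0, 4.0]})
    one_col = df["a"]              # a Series, not a DataFrame
    print(type(one_col).__name__)  # Series
    frame = one_col.to_frame()     # promoted back to a (2, 1) DataFrame
    print(type(frame).__name__)    # DataFrame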
@@ -271,6 +278,7 @@ def encode_classification_target(
     df: pd.DataFrame,
     target_col: str,
     save_dir: Union[str, Path],
+    suffix: str = "",
     verbose: int = 2
 ) -> tuple[pd.DataFrame, dict[str, int]]:
     """
@@ -283,6 +291,7 @@ def encode_classification_target(
         df (pd.DataFrame): Input DataFrame.
         target_col (str): Name of the target column to encode.
         save_dir (str | Path): Directory where the class map JSON will be saved.
+        suffix (str): Suffix to append to the class map filename.
         verbose (int): Verbosity level for logging.
 
     Returns:
@@ -300,9 +309,17 @@ def encode_classification_target(
         _LOGGER.error(f"Target column '{target_col}' contains {n_missing} missing values. Please handle them before encoding.")
         raise ValueError()
 
+    # validate suffix and prepend underscore if needed
+    if suffix:
+        if not suffix.startswith("_"):
+            suffix = f"_{suffix}"
+        sanitized_suffix = suffix
+    else:
+        sanitized_suffix = ''
+
     # Ensure directory exists
     save_path = make_fullpath(save_dir, make=True, enforce="directory")
-    file_path = save_path / "class_map.json"
+    file_path = save_path / f"class_map{sanitized_suffix}.json"
 
     # Get unique values and sort them to ensure deterministic encoding (0, 1, 2...)
     # Convert to string to ensure the keys in JSON are strings
@@ -322,10 +339,9 @@ def encode_classification_target(
         json.dump(class_map, f, indent=4)
 
     if verbose >= 2:
-        _LOGGER.info(f"Class mapping saved to: '{file_path}'")
-
+        _LOGGER.info(f"Target '{target_col}' encoded with {len(class_map)} classes. Saved to {file_path}.")
+
     if verbose >= 3:
-        _LOGGER.info(f"Target '{target_col}' encoded with {len(class_map)} classes.")
         # Print a preview
         if len(class_map) <= 10:
             print(f" Mapping: {class_map}")
@@ -657,3 +673,66 @@ def reconstruct_multibinary(
         _LOGGER.info(f"Reconstructed {converted_count} binary columns matching '{pattern}'.")
 
     return new_df, target_columns
+
+
+def filter_subset(
+    df: pd.DataFrame,
+    filters: Union[dict[str, Any], dict[str, list[Any]]],
+    drop_filter_cols: bool = True,
+    reset_index: bool = True,
+    verbose: int = 3
+) -> pd.DataFrame:
+    """
+    Filters the DataFrame based on a dictionary of column-value conditions.
+
+    Supports:
+    - Single value matching (e.g., {"Color": "Blue"})
+    - Multiple value matching (e.g., {"Color": ["Blue", "Red"]}) -> OR logic within column.
+    - Multiple column filtering (e.g., {"Color": "Blue", "Size": "Large"}) -> AND logic between columns.
+
+    Args:
+        df (pd.DataFrame): Input DataFrame.
+        filters (dict[str, Any] | dict[str, list[Any]]): Dictionary where keys are column names and values are the target values (scalar or list).
+        drop_filter_cols (bool): If True, drops the columns used for filtering from the result.
+        reset_index (bool): If True, resets the index of the resulting DataFrame.
+        verbose (int): Verbosity level.
+
+    Returns:
+        pd.DataFrame: The filtered DataFrame.
+    """
+    df_filtered = df.copy()
+
+    # Validate columns exist
+    missing_cols = [col for col in filters.keys() if col not in df.columns]
+    if missing_cols:
+        _LOGGER.error(f"Filter columns not found: {missing_cols}")
+        raise ValueError()
+
+    if verbose >= 2:
+        _LOGGER.info(f"Original shape: {df.shape}")
+
+    for col, value in filters.items():
+        # Handle list of values (OR logic within column)
+        if isinstance(value, list):
+            df_filtered = df_filtered[df_filtered[col].isin(value)]
+        # Handle single value
+        else:
+            # Warn if the value is a floating point due to potential precision issues
+            if isinstance(value, float) and verbose >= 1:
+                _LOGGER.warning(f"Filtering on column '{col}' with float value '{value}'.")
+            df_filtered = df_filtered[df_filtered[col] == value]
+
+    if drop_filter_cols:
+        if verbose >= 3:
+            _LOGGER.info(f"Dropping filter columns: {list(filters.keys())}")
+        df_filtered.drop(columns=list(filters.keys()), inplace=True)
+
+    if reset_index:
+        if verbose >= 3:
+            _LOGGER.info("Resetting index of the filtered DataFrame.")
+        df_filtered.reset_index(drop=True, inplace=True)
+
+    if verbose >= 2:
+        _LOGGER.info(f"Filtered shape: {df_filtered.shape}")
+
+    return df_filtered
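A usage sketch for the new filter_subset, following the semantics documented in its docstring (list values are OR-ed within a column, separate keys are AND-ed across columns); the example data is invented:

    import pandas as pd

    df = pd.DataFrame({
        "Color": ["Blue", "Red", "Green", "Blue"],
        "Size":  ["Large", "Large", "Small", "Small"],
        "Price": [10, 20, 30, 40],
    })

    # Keep rows where Color is Blue OR Red, AND Size is Large.
    # By default the filter columns are dropped and the index is reset.
    subset = filter_subset(df, {"Color": ["Blue", "Red"], "Size": "Large"})
    # -> a one-column DataFrame (Price) with rows [10, 20]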
--- ml_tools/data_exploration/_plotting.py
+++ ml_tools/data_exploration/_plotting.py
@@ -475,6 +475,9 @@ def plot_correlation_heatmap(df: pd.DataFrame,
     save_path = make_fullpath(save_dir, make=True)
     # sanitize the plot title to save the file
     sanitized_plot_title = sanitize_filename(plot_title)
+    # prepend method to filename
+    sanitized_plot_title = f"{method}_{sanitized_plot_title}"
+
     plot_filename = sanitized_plot_title + ".svg"
 
     full_path = save_path / plot_filename
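The practical effect of the last hunk, assuming method is the correlation-method argument of plot_correlation_heatmap and that sanitize_filename replaces spaces with underscores: heatmaps for different methods saved to the same directory no longer overwrite one another.

    # Before: plot_title "Correlation Heatmap" always saved as Correlation_Heatmap.svg
    # After:  method="spearman" -> spearman_Correlation_Heatmap.svg
    #         method="pearson"  -> pearson_Correlation_Heatmap.svg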