dragon-ml-toolbox 1.4.1__tar.gz → 1.4.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dragon-ml-toolbox might be problematic.

Files changed (24)
  1. {dragon_ml_toolbox-1.4.1 → dragon_ml_toolbox-1.4.3}/LICENSE-THIRD-PARTY.md +6 -1
  2. {dragon_ml_toolbox-1.4.1/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-1.4.3}/PKG-INFO +1 -1
  3. {dragon_ml_toolbox-1.4.1 → dragon_ml_toolbox-1.4.3/dragon_ml_toolbox.egg-info}/PKG-INFO +1 -1
  4. {dragon_ml_toolbox-1.4.1 → dragon_ml_toolbox-1.4.3}/ml_tools/MICE_imputation.py +22 -14
  5. {dragon_ml_toolbox-1.4.1 → dragon_ml_toolbox-1.4.3}/ml_tools/data_exploration.py +41 -8
  6. {dragon_ml_toolbox-1.4.1 → dragon_ml_toolbox-1.4.3}/ml_tools/ensemble_learning.py +446 -187
  7. {dragon_ml_toolbox-1.4.1 → dragon_ml_toolbox-1.4.3}/ml_tools/particle_swarm_optimization.py +43 -52
  8. {dragon_ml_toolbox-1.4.1 → dragon_ml_toolbox-1.4.3}/ml_tools/utilities.py +44 -8
  9. {dragon_ml_toolbox-1.4.1 → dragon_ml_toolbox-1.4.3}/pyproject.toml +1 -1
  10. {dragon_ml_toolbox-1.4.1 → dragon_ml_toolbox-1.4.3}/LICENSE +0 -0
  11. {dragon_ml_toolbox-1.4.1 → dragon_ml_toolbox-1.4.3}/README.md +0 -0
  12. {dragon_ml_toolbox-1.4.1 → dragon_ml_toolbox-1.4.3}/dragon_ml_toolbox.egg-info/SOURCES.txt +0 -0
  13. {dragon_ml_toolbox-1.4.1 → dragon_ml_toolbox-1.4.3}/dragon_ml_toolbox.egg-info/dependency_links.txt +0 -0
  14. {dragon_ml_toolbox-1.4.1 → dragon_ml_toolbox-1.4.3}/dragon_ml_toolbox.egg-info/requires.txt +0 -0
  15. {dragon_ml_toolbox-1.4.1 → dragon_ml_toolbox-1.4.3}/dragon_ml_toolbox.egg-info/top_level.txt +0 -0
  16. {dragon_ml_toolbox-1.4.1 → dragon_ml_toolbox-1.4.3}/ml_tools/VIF_factor.py +0 -0
  17. {dragon_ml_toolbox-1.4.1 → dragon_ml_toolbox-1.4.3}/ml_tools/__init__.py +0 -0
  18. {dragon_ml_toolbox-1.4.1 → dragon_ml_toolbox-1.4.3}/ml_tools/datasetmaster.py +0 -0
  19. {dragon_ml_toolbox-1.4.1 → dragon_ml_toolbox-1.4.3}/ml_tools/handle_excel.py +0 -0
  20. {dragon_ml_toolbox-1.4.1 → dragon_ml_toolbox-1.4.3}/ml_tools/logger.py +0 -0
  21. {dragon_ml_toolbox-1.4.1 → dragon_ml_toolbox-1.4.3}/ml_tools/pytorch_models.py +0 -0
  22. {dragon_ml_toolbox-1.4.1 → dragon_ml_toolbox-1.4.3}/ml_tools/trainer.py +0 -0
  23. {dragon_ml_toolbox-1.4.1 → dragon_ml_toolbox-1.4.3}/ml_tools/vision_helpers.py +0 -0
  24. {dragon_ml_toolbox-1.4.1 → dragon_ml_toolbox-1.4.3}/setup.cfg +0 -0
@@ -8,7 +8,12 @@ This project depends on the following third-party packages. Each is governed by
 - [seaborn](https://github.com/mwaskom/seaborn/blob/main/LICENSE)
 - [statsmodels](https://github.com/statsmodels/statsmodels/blob/main/LICENSE.txt)
 - [ipython](https://github.com/ipython/ipython/blob/main/COPYING.rst)
+- [ipykernel](https://github.com/ipython/ipykernel/blob/main/COPYING.rst)
+- [notebook](https://github.com/jupyter/notebook/blob/main/LICENSE)
+- [jupyterlab](https://github.com/jupyterlab/jupyterlab/blob/main/LICENSE)
+- [ipywidgets](https://github.com/jupyter-widgets/ipywidgets/blob/main/LICENSE)
 - [torch](https://github.com/pytorch/pytorch/blob/main/LICENSE)
+- [torchvision](https://github.com/pytorch/vision/blob/main/LICENSE)
 - [scikit-learn](https://github.com/scikit-learn/scikit-learn/blob/main/COPYING)
 - [imblearn](https://github.com/scikit-learn-contrib/imbalanced-learn/blob/main/LICENSE)
 - [Pillow](https://github.com/python-pillow/Pillow/blob/main/LICENSE)
@@ -19,5 +24,5 @@ This project depends on the following third-party packages. Each is governed by
 - [openpyxl](https://github.com/chronossc/openpyxl/blob/main/LICENSE)
 - [miceforest](https://github.com/AnotherSamWilson/miceforest/blob/main/LICENSE)
 - [polars](https://github.com/pola-rs/polars/blob/main/LICENSE)
-- [torchvision](https://github.com/pytorch/vision/blob/main/LICENSE)
+- [plotnine](https://github.com/has2k1/plotnine/blob/main/LICENSE.txt)
 - [pyswarm](https://pythonhosted.org/pyswarm/#license)
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 1.4.1
+Version: 1.4.3
 Summary: A collection of tools for data science and machine learning projects
 Author-email: Karl Loza <luigiloza@gmail.com>
 License-Expression: MIT
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 1.4.1
+Version: 1.4.3
 Summary: A collection of tools for data science and machine learning projects
 Author-email: Karl Loza <luigiloza@gmail.com>
 License-Expression: MIT
@@ -3,7 +3,7 @@ import miceforest as mf
 import os
 import matplotlib.pyplot as plt
 import numpy as np
-from ml_tools.utilities import load_dataframe, list_csv_paths, sanitize_filename, _script_info
+from ml_tools.utilities import load_dataframe, list_csv_paths, sanitize_filename, _script_info, merge_dataframes, save_dataframe
 from plotnine import ggplot, labs, theme, element_blank # type: ignore
 
 
@@ -49,15 +49,11 @@ def apply_mice(df: pd.DataFrame, df_name: str, resulting_datasets: int=1, iterat
     return kernel, imputed_datasets, imputed_dataset_names
 
 
-def save_imputed_datasets(save_dir: str, imputed_datasets: list, imputed_dataset_names: list[str]):
-    # Check path
-    os.makedirs(save_dir, exist_ok=True)
-
+def save_imputed_datasets(save_dir: str, imputed_datasets: list, df_targets: pd.DataFrame, imputed_dataset_names: list[str]):
     for imputed_df, subname in zip(imputed_datasets, imputed_dataset_names):
-        output_path = os.path.join(save_dir, subname + ".csv")
-        imputed_df.to_csv(output_path, index=False, encoding='utf-8')
-        print(f"\tSaved {subname} with shape {imputed_df.shape}")
-
+        merged_df = merge_dataframes(imputed_df, df_targets, direction="horizontal", verbose=False)
+        save_dataframe(df=merged_df, save_dir=save_dir, filename=subname)
+
 
 #Get names of features that had missing values before imputation
 def get_na_column_names(df: pd.DataFrame):
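For orientation, a minimal usage sketch of the new save_imputed_datasets signature. The frames, dataset name, and the imputed_output directory are hypothetical, and the horizontal-concatenation behavior of merge_dataframes is assumed from its keyword arguments in the hunk above:

    # Hypothetical usage sketch; assumes the function is importable as below.
    from ml_tools.MICE_imputation import save_imputed_datasets
    import pandas as pd

    # One imputed feature frame (e.g. produced by apply_mice) and the
    # target columns that were held out of the imputation.
    imputed_datasets = [pd.DataFrame({"feat_a": [1.0, 2.0, 3.0]})]
    df_targets = pd.DataFrame({"target": [0.5, 1.5, 2.5]})

    # Each imputed frame is merged side-by-side with the targets, then saved.
    save_imputed_datasets(
        save_dir="imputed_output",
        imputed_datasets=imputed_datasets,
        df_targets=df_targets,
        imputed_dataset_names=["dataset_MICE_1"],
    )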
@@ -119,7 +115,7 @@ def get_convergence_diagnostic(kernel: mf.ImputationKernel, imputed_dataset_name
         plt.savefig(save_path, bbox_inches='tight', format="svg")
         plt.close()
 
-    print(f"\t{dataset_file_dir} completed.")
+    print(f"{dataset_file_dir} completed.")
 
 
 # Imputed distributions
@@ -131,7 +127,8 @@ def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_di
     '''
     # Check path
     os.makedirs(root_dir, exist_ok=True)
-    local_save_dir = os.path.join(root_dir, f"Distribution_Metrics_{df_name}_imputed")
+    local_dir_name = f"Distribution_Metrics_{df_name}_imputed"
+    local_save_dir = os.path.join(root_dir, local_dir_name)
     if not os.path.isdir(local_save_dir):
         os.makedirs(local_save_dir)
 
@@ -202,10 +199,10 @@ def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_di
         fig = kernel.plot_imputed_distributions(variables=[feature])
         _process_figure(fig, feature)
 
-    print("\tImputed distributions saved successfully.")
+    print(f"{local_dir_name} completed.")
 
 
-def run_mice_pipeline(df_path_or_dir: str, save_datasets_dir: str, save_metrics_dir: str, resulting_datasets: int=1, iterations: int=20, random_state: int=101):
+def run_mice_pipeline(df_path_or_dir: str, target_columns: list[str], save_datasets_dir: str, save_metrics_dir: str, resulting_datasets: int=1, iterations: int=20, random_state: int=101):
     """
     Call functions in sequence for each dataset in the provided path or directory:
     1. Load dataframe
@@ -213,6 +210,8 @@ def run_mice_pipeline(df_path_or_dir: str, save_datasets_dir: str, save_metrics_
     3. Save imputed dataset(s)
     4. Save convergence metrics
     5. Save distribution metrics
+
+    Target columns are excluded from the imputation and merged back when saving.
     """
     # Check paths
     os.makedirs(save_datasets_dir, exist_ok=True)
@@ -228,9 +227,11 @@ def run_mice_pipeline(df_path_or_dir: str, save_datasets_dir: str, save_metrics_
     for df_path in all_file_paths:
         df, df_name = load_dataframe(df_path=df_path)
 
+        df, df_targets = _skip_targets(df, target_columns)
+
         kernel, imputed_datasets, imputed_dataset_names = apply_mice(df=df, df_name=df_name, resulting_datasets=resulting_datasets, iterations=iterations, random_state=random_state)
 
-        save_imputed_datasets(save_dir=save_datasets_dir, imputed_datasets=imputed_datasets, imputed_dataset_names=imputed_dataset_names)
+        save_imputed_datasets(save_dir=save_datasets_dir, imputed_datasets=imputed_datasets, df_targets=df_targets, imputed_dataset_names=imputed_dataset_names)
 
         imputed_column_names = get_na_column_names(df=df)
 
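A sketch of calling the updated pipeline entry point with the new target_columns parameter; every path and column name below is a placeholder, and the remaining keyword values simply echo the defaults shown in the signature above:

    # Hypothetical call; assumes the function is importable as below.
    from ml_tools.MICE_imputation import run_mice_pipeline

    run_mice_pipeline(
        df_path_or_dir="data/raw",              # placeholder: a CSV file or a directory of CSVs
        target_columns=["target_1", "target_2"],
        save_datasets_dir="data/imputed",
        save_metrics_dir="data/mice_metrics",
        resulting_datasets=3,
        iterations=20,
        random_state=101,
    )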
@@ -239,5 +240,12 @@ def run_mice_pipeline(df_path_or_dir: str, save_datasets_dir: str, save_metrics_
         get_imputed_distributions(kernel=kernel, df_name=df_name, root_dir=save_metrics_dir, column_names=imputed_column_names)
 
 
+def _skip_targets(df: pd.DataFrame, target_cols: list[str]):
+    valid_targets = [col for col in target_cols if col in df.columns]
+    df_targets = df[valid_targets]
+    df_feats = df.drop(columns=valid_targets)
+    return df_feats, df_targets
+
+
 def info():
     _script_info(__all__)
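The new helper's behavior can be seen in a small, self-contained example (data invented for illustration); note that target names absent from the frame are silently ignored:

    # Illustrative only; _skip_targets is a private helper of the module above.
    from ml_tools.MICE_imputation import _skip_targets
    import pandas as pd

    df = pd.DataFrame({"f1": [1, 2], "f2": [3, 4], "y": [0, 1]})

    # 'not_a_column' is filtered out because it is not in df.columns.
    df_feats, df_targets = _skip_targets(df, target_cols=["y", "not_a_column"])
    print(list(df_feats.columns))    # ['f1', 'f2']
    print(list(df_targets.columns))  # ['y']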
@@ -5,10 +5,8 @@ import seaborn as sns
 from IPython import get_ipython
 from IPython.display import clear_output
 import time
-from typing import Union, Literal, Dict, Tuple
+from typing import Union, Literal, Dict, Tuple, Iterator
 import os
-import sys
-import textwrap
 from ml_tools.utilities import sanitize_filename, _script_info
 
 
@@ -24,7 +22,8 @@ __all__ = [
     "check_value_distributions",
     "plot_value_distributions",
     "clip_outliers_single",
-    "clip_outliers_multi"
+    "clip_outliers_multi",
+    "distribute_datasets_by_target"
 ]
 
 
@@ -113,7 +112,7 @@ def show_null_columns(df: pd.DataFrame, round_digits: int = 2):
     Parameters:
         df (pd.DataFrame): The input DataFrame.
         round_digits (int): Number of decimal places for the percentage.
-        
+
     Returns:
         pd.DataFrame: A DataFrame summarizing missing values in each column.
     """
@@ -133,13 +132,14 @@ def show_null_columns(df: pd.DataFrame, round_digits: int = 2):
     return null_summary
 
 
-def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7) -> pd.DataFrame:
+def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7, show_nulls_after: bool = True) -> pd.DataFrame:
     """
     Drops columns with more than `threshold` fraction of missing values.
 
     Parameters:
         df (pd.DataFrame): The input DataFrame.
         threshold (float): Fraction of missing values above which columns are dropped.
+        show_nulls_after (bool): Show the null summary (`show_null_columns`) after dropping columns.
 
     Returns:
         pd.DataFrame: A new DataFrame without the dropped columns.
@@ -150,10 +150,15 @@ def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7) ->
     if len(cols_to_drop) > 0:
         print(f"Dropping columns with more than {threshold*100:.0f}% missing data:")
         print(list(cols_to_drop))
+
+        result_df = df.drop(columns=cols_to_drop)
+        if show_nulls_after:
+            show_null_columns(df=result_df).head(20)
+
+        return result_df
     else:
         print(f"No columns have more than {threshold*100:.0f}% missing data.")
-
-    return df.drop(columns=cols_to_drop)
+        return df
 
 
 def split_continuous_binary(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
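A minimal sketch of the reworked function (example data invented for illustration): with 3 of 4 values missing, mostly_missing exceeds the 0.7 threshold and is dropped, and show_nulls_after=False skips the follow-up null summary:

    # Illustrative only; assumes the function is importable as below.
    from ml_tools.data_exploration import drop_columns_with_missing_data
    import numpy as np
    import pandas as pd

    df = pd.DataFrame({
        "mostly_missing": [np.nan, np.nan, np.nan, 1.0],  # 75% missing
        "complete": [1, 2, 3, 4],
    })

    # 0.75 > 0.7, so 'mostly_missing' is dropped.
    cleaned = drop_columns_with_missing_data(df, threshold=0.7, show_nulls_after=False)
    print(list(cleaned.columns))  # ['complete']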
@@ -514,6 +519,34 @@ def clip_outliers_multi(
     return new_df
 
 
+def distribute_datasets_by_target(
+    df: pd.DataFrame,
+    target_columns: list[str]
+) -> Iterator[Tuple[str, pd.DataFrame]]:
+    """
+    Yields cleaned DataFrames for each target column, where rows with missing
+    target values are removed. The target column is placed at the end.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        Preprocessed dataframe with all feature and target columns ready to train.
+    target_columns : List[str]
+        List of target column names to generate per-target DataFrames.
+
+    Yields
+    ------
+    Tuple[str, pd.DataFrame]
+        * First element is the target column name.
+        * Second element is the corresponding cleaned DataFrame.
+    """
+    feature_columns = [col for col in df.columns if col not in target_columns]
+
+    for target in target_columns:
+        subset = df[feature_columns + [target]].dropna(subset=[target])
+        yield target, subset
+
+
 def _is_notebook():
     return get_ipython() is not None
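A small illustrative run of the new generator (data invented for illustration), showing one per-target frame yielded for each entry in target_columns:

    # Illustrative only; assumes the function is importable as below.
    from ml_tools.data_exploration import distribute_datasets_by_target
    import numpy as np
    import pandas as pd

    df = pd.DataFrame({
        "f1": [1, 2, 3],
        "t1": [0.5, np.nan, 1.5],
        "t2": [np.nan, 2.0, 3.0],
    })

    # Each subset keeps all features plus one target, with NaN-target rows dropped.
    for target_name, subset in distribute_datasets_by_target(df, target_columns=["t1", "t2"]):
        print(target_name, subset.shape)  # t1 (2, 2) then t2 (2, 2)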