dragon-ml-toolbox 1.3.1__tar.gz → 1.3.2__tar.gz

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dragon-ml-toolbox might be problematic. Click here for more details.

Files changed (23):
  1. {dragon_ml_toolbox-1.3.1/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-1.3.2}/PKG-INFO +7 -3
  2. {dragon_ml_toolbox-1.3.1 → dragon_ml_toolbox-1.3.2/dragon_ml_toolbox.egg-info}/PKG-INFO +7 -3
  3. {dragon_ml_toolbox-1.3.1 → dragon_ml_toolbox-1.3.2}/dragon_ml_toolbox.egg-info/requires.txt +6 -2
  4. {dragon_ml_toolbox-1.3.1 → dragon_ml_toolbox-1.3.2}/ml_tools/MICE_imputation.py +75 -28
  5. {dragon_ml_toolbox-1.3.1 → dragon_ml_toolbox-1.3.2}/pyproject.toml +7 -3
  6. {dragon_ml_toolbox-1.3.1 → dragon_ml_toolbox-1.3.2}/LICENSE +0 -0
  7. {dragon_ml_toolbox-1.3.1 → dragon_ml_toolbox-1.3.2}/LICENSE-THIRD-PARTY.md +0 -0
  8. {dragon_ml_toolbox-1.3.1 → dragon_ml_toolbox-1.3.2}/README.md +0 -0
  9. {dragon_ml_toolbox-1.3.1 → dragon_ml_toolbox-1.3.2}/dragon_ml_toolbox.egg-info/SOURCES.txt +0 -0
  10. {dragon_ml_toolbox-1.3.1 → dragon_ml_toolbox-1.3.2}/dragon_ml_toolbox.egg-info/dependency_links.txt +0 -0
  11. {dragon_ml_toolbox-1.3.1 → dragon_ml_toolbox-1.3.2}/dragon_ml_toolbox.egg-info/top_level.txt +0 -0
  12. {dragon_ml_toolbox-1.3.1 → dragon_ml_toolbox-1.3.2}/ml_tools/__init__.py +0 -0
  13. {dragon_ml_toolbox-1.3.1 → dragon_ml_toolbox-1.3.2}/ml_tools/data_exploration.py +0 -0
  14. {dragon_ml_toolbox-1.3.1 → dragon_ml_toolbox-1.3.2}/ml_tools/datasetmaster.py +0 -0
  15. {dragon_ml_toolbox-1.3.1 → dragon_ml_toolbox-1.3.2}/ml_tools/ensemble_learning.py +0 -0
  16. {dragon_ml_toolbox-1.3.1 → dragon_ml_toolbox-1.3.2}/ml_tools/handle_excel.py +0 -0
  17. {dragon_ml_toolbox-1.3.1 → dragon_ml_toolbox-1.3.2}/ml_tools/logger.py +0 -0
  18. {dragon_ml_toolbox-1.3.1 → dragon_ml_toolbox-1.3.2}/ml_tools/particle_swarm_optimization.py +0 -0
  19. {dragon_ml_toolbox-1.3.1 → dragon_ml_toolbox-1.3.2}/ml_tools/pytorch_models.py +0 -0
  20. {dragon_ml_toolbox-1.3.1 → dragon_ml_toolbox-1.3.2}/ml_tools/trainer.py +0 -0
  21. {dragon_ml_toolbox-1.3.1 → dragon_ml_toolbox-1.3.2}/ml_tools/utilities.py +0 -0
  22. {dragon_ml_toolbox-1.3.1 → dragon_ml_toolbox-1.3.2}/ml_tools/vision_helpers.py +0 -0
  23. {dragon_ml_toolbox-1.3.1 → dragon_ml_toolbox-1.3.2}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dragon-ml-toolbox
3
- Version: 1.3.1
3
+ Version: 1.3.2
4
4
  Summary: A collection of tools for data science and machine learning projects
5
5
  Author-email: Karl Loza <luigiloza@gmail.com>
6
6
  License-Expression: MIT
@@ -15,7 +15,8 @@ License-File: LICENSE-THIRD-PARTY.md
15
15
  Requires-Dist: numpy<2.0
16
16
  Requires-Dist: scikit-learn
17
17
  Requires-Dist: openpyxl
18
- Requires-Dist: miceforest
18
+ Requires-Dist: miceforest<7.0.0,>=6.0.0
19
+ Requires-Dist: plotnine<0.13,>=0.12
19
20
  Requires-Dist: matplotlib
20
21
  Requires-Dist: seaborn
21
22
  Requires-Dist: pandas
@@ -23,9 +24,12 @@ Requires-Dist: polars
23
24
  Requires-Dist: imbalanced-learn
24
25
  Requires-Dist: statsmodels
25
26
  Requires-Dist: ipython
27
+ Requires-Dist: ipykernel
28
+ Requires-Dist: notebook
29
+ Requires-Dist: jupyterlab
26
30
  Requires-Dist: joblib
27
31
  Requires-Dist: xgboost
28
- Requires-Dist: lightgbm
32
+ Requires-Dist: lightgbm<=4.5.0
29
33
  Requires-Dist: shap
30
34
  Provides-Extra: pytorch
31
35
  Requires-Dist: torch; extra == "pytorch"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dragon-ml-toolbox
3
- Version: 1.3.1
3
+ Version: 1.3.2
4
4
  Summary: A collection of tools for data science and machine learning projects
5
5
  Author-email: Karl Loza <luigiloza@gmail.com>
6
6
  License-Expression: MIT
@@ -15,7 +15,8 @@ License-File: LICENSE-THIRD-PARTY.md
15
15
  Requires-Dist: numpy<2.0
16
16
  Requires-Dist: scikit-learn
17
17
  Requires-Dist: openpyxl
18
- Requires-Dist: miceforest
18
+ Requires-Dist: miceforest<7.0.0,>=6.0.0
19
+ Requires-Dist: plotnine<0.13,>=0.12
19
20
  Requires-Dist: matplotlib
20
21
  Requires-Dist: seaborn
21
22
  Requires-Dist: pandas
@@ -23,9 +24,12 @@ Requires-Dist: polars
23
24
  Requires-Dist: imbalanced-learn
24
25
  Requires-Dist: statsmodels
25
26
  Requires-Dist: ipython
27
+ Requires-Dist: ipykernel
28
+ Requires-Dist: notebook
29
+ Requires-Dist: jupyterlab
26
30
  Requires-Dist: joblib
27
31
  Requires-Dist: xgboost
28
- Requires-Dist: lightgbm
32
+ Requires-Dist: lightgbm<=4.5.0
29
33
  Requires-Dist: shap
30
34
  Provides-Extra: pytorch
31
35
  Requires-Dist: torch; extra == "pytorch"
@@ -1,7 +1,8 @@
1
1
  numpy<2.0
2
2
  scikit-learn
3
3
  openpyxl
4
- miceforest
4
+ miceforest<7.0.0,>=6.0.0
5
+ plotnine<0.13,>=0.12
5
6
  matplotlib
6
7
  seaborn
7
8
  pandas
@@ -9,9 +10,12 @@ polars
9
10
  imbalanced-learn
10
11
  statsmodels
11
12
  ipython
13
+ ipykernel
14
+ notebook
15
+ jupyterlab
12
16
  joblib
13
17
  xgboost
14
- lightgbm
18
+ lightgbm<=4.5.0
15
19
  shap
16
20
 
17
21
  [pytorch]
@@ -3,15 +3,15 @@ import miceforest as mf
3
3
  import os
4
4
  import matplotlib.pyplot as plt
5
5
  import numpy as np
6
- from ml_tools.utilities import load_dataframe, list_csv_paths
7
-
6
+ from ml_tools.utilities import load_dataframe, list_csv_paths, sanitize_filename
7
+ from plotnine import ggplot, labs, theme, element_blank # type: ignore
8
8
 
9
9
  def apply_mice(df: pd.DataFrame, df_name: str, resulting_datasets: int=1, iterations: int=20, random_state: int=101):
10
10
 
11
11
  # Initialize kernel with number of imputed datasets to generate
12
12
  kernel = mf.ImputationKernel(
13
13
  data=df,
14
- datasets=resulting_datasets,
14
+ num_datasets=resulting_datasets,
15
15
  random_state=random_state
16
16
  )
17
17
 
@@ -21,6 +21,9 @@ def apply_mice(df: pd.DataFrame, df_name: str, resulting_datasets: int=1, iterat
21
21
  # Retrieve the imputed datasets
22
22
  imputed_datasets = [kernel.complete_data(dataset=i) for i in range(resulting_datasets)]
23
23
 
24
+ if imputed_datasets is None or len(imputed_datasets) == 0:
25
+ raise ValueError("No imputed datasets were generated. Check the MICE process.")
26
+
24
27
  if resulting_datasets == 1:
25
28
  imputed_dataset_names = [f"{df_name}_imputed"]
26
29
  else:
@@ -28,8 +31,8 @@ def apply_mice(df: pd.DataFrame, df_name: str, resulting_datasets: int=1, iterat
28
31
 
29
32
  # Ensure indexes match
30
33
  for imputed_df, subname in zip(imputed_datasets, imputed_dataset_names):
31
- assert imputed_df.shape[0] == df.shape[0], f"Row count mismatch in dataset {subname}"
32
- assert all(imputed_df.index == df.index), f"Index mismatch in dataset {subname}"
34
+ assert imputed_df.shape[0] == df.shape[0], f"Row count mismatch in dataset {subname}" # type: ignore
35
+ assert all(imputed_df.index == df.index), f"Index mismatch in dataset {subname}" # type: ignore
33
36
  # print("✅ All imputed datasets match the original DataFrame indexes.")
34
37
 
35
38
  return kernel, imputed_datasets, imputed_dataset_names
@@ -51,46 +54,65 @@ def get_na_column_names(df: pd.DataFrame):
51
54
 
52
55
 
53
56
  #Convergence diagnostic
54
- def get_convergence_diagnostic(kernel: mf.ImputationKernel, imputed_dataset_names: list[str], column_names: list[str], root_dir: str):
57
+ def get_convergence_diagnostic(kernel: mf.ImputationKernel, imputed_dataset_names: list[str], column_names: list[str], root_dir: str, fontsize: int=16):
58
+ """
59
+ Generate and save convergence diagnostic plots for imputed variables.
60
+
61
+ Parameters:
62
+ - kernel: Trained miceforest.ImputationKernel.
63
+ - imputed_dataset_names: Names assigned to each imputed dataset.
64
+ - column_names: List of feature names to track over iterations.
65
+ - root_dir: Directory to save convergence plots.
66
+ """
55
67
  # get number of iterations used
56
68
  iterations_cap = kernel.iteration_count()
69
+ dataset_count = kernel.num_datasets
70
+
71
+ if dataset_count != len(imputed_dataset_names):
72
+ raise ValueError(f"Expected {dataset_count} names in imputed_dataset_names, got {len(imputed_dataset_names)}")
57
73
 
58
74
  # Check path
59
75
  os.makedirs(root_dir, exist_ok=True)
60
76
 
77
+ # Styling parameters
78
+ label_font = {'size': fontsize, 'weight': 'bold'}
79
+
61
80
  # iterate over each imputed dataset
62
- for dataset_id, imputed_dataset_name in zip(range(kernel.num_datasets), imputed_dataset_names):
81
+ for dataset_id, imputed_dataset_name in zip(range(dataset_count), imputed_dataset_names):
63
82
  #Check directory for current dataset
64
83
  dataset_file_dir = f"Convergence_Metrics_{imputed_dataset_name}"
65
84
  local_save_dir = os.path.join(root_dir, dataset_file_dir)
66
- if not os.path.isdir(local_save_dir):
67
- os.makedirs(local_save_dir)
85
+ os.makedirs(local_save_dir, exist_ok=True)
68
86
 
69
87
  for feature_name in column_names:
70
88
  means_per_iteration = []
71
89
  for iteration in range(iterations_cap):
72
90
  current_imputed = kernel.complete_data(dataset=dataset_id, iteration=iteration)
73
- means_per_iteration.append(np.mean(current_imputed[feature_name]))
74
-
91
+ means_per_iteration.append(np.mean(current_imputed[feature_name])) # type: ignore
92
+
93
+ plt.figure(figsize=(10, 8))
75
94
  plt.plot(means_per_iteration, marker='o')
76
- plt.xlabel("Iteration")
77
- plt.ylabel("Mean of Imputed Values")
78
- plt.title(f"Mean Convergence for '{feature_name}'")
95
+ plt.xlabel("Iteration", **label_font)
96
+ plt.ylabel("Mean of Imputed Values", **label_font)
97
+ plt.title(f"Mean Convergence for '{feature_name}'", **label_font)
79
98
 
80
99
  # Adjust plot display for the X axis
81
100
  _ticks = np.arange(iterations_cap)
82
101
  _labels = np.arange(1, iterations_cap + 1)
83
102
  plt.xticks(ticks=_ticks, labels=_labels)
103
+ plt.grid(True)
84
104
 
85
- save_path = os.path.join(local_save_dir, feature_name + ".svg")
105
+ feature_save_name = sanitize_filename(feature_name)
106
+
107
+ save_path = os.path.join(local_save_dir, feature_save_name + ".svg")
86
108
  plt.savefig(save_path, bbox_inches='tight', format="svg")
87
109
  plt.close()
88
110
 
89
- print(f"{dataset_file_dir} completed.")
111
+ print(f"\t{dataset_file_dir} completed.")
90
112
 
91
113
 
92
114
  # Imputed distributions
93
- def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_dir: str, column_names: list[str], one_plot: bool=False, fontsize: int=18):
115
+ def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_dir: str, column_names: list[str], one_plot: bool=False, fontsize: int=14):
94
116
  '''
95
117
  It works using miceforest's authors implementation of the method `.plot_imputed_distributions()`.
96
118
 
@@ -106,12 +128,35 @@ def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_di
106
128
  legend_kwargs = {'frameon': True, 'facecolor': 'white', 'framealpha': 0.8}
107
129
  label_font = {'size': fontsize, 'weight': 'bold'}
108
130
 
109
- def _process_figure(fig, filename):
131
+ def _process_figure(fig, filename: str):
110
132
  """Helper function to add labels and legends to a figure"""
111
- for ax in fig.axes:
133
+
134
+ if not isinstance(fig, ggplot):
135
+ raise TypeError("Expected a plotnine.ggplot object")
136
+
137
+ # Edit labels and title
138
+ fig = fig + theme(
139
+ plot_title=element_blank(), # removes labs(title=...)
140
+ strip_text=element_blank() # removes facet_wrap labels
141
+ )
142
+
143
+ fig = fig + labs(y="", x="")
144
+
145
+ # Render to matplotlib figure
146
+ fig = fig.draw()
147
+
148
+ if not hasattr(fig, 'axes') or len(fig.axes) == 0:
149
+ raise RuntimeError("Rendered figure has no axes to modify")
150
+
151
+ if filename == "Combined_Distributions":
152
+ custom_xlabel = "Feature Values"
153
+ else:
154
+ custom_xlabel = filename
155
+
156
+ for ax in fig.axes:
112
157
  # Set axis labels
113
- ax.set_xlabel('Value', **label_font)
114
- ax.set_ylabel('Density', **label_font)
158
+ ax.set_xlabel(custom_xlabel, **label_font)
159
+ ax.set_ylabel('Distribution', **label_font)
115
160
 
116
161
  # Add legend based on line colors
117
162
  lines = ax.get_lines()
@@ -122,26 +167,28 @@ def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_di
122
167
  ax.legend(**legend_kwargs)
123
168
 
124
169
  # Adjust layout and save
125
- fig.tight_layout()
170
+ # fig.tight_layout()
171
+ # fig.subplots_adjust(bottom=0.2, left=0.2) # Optional, depending on overflow
126
172
  fig.savefig(
127
- os.path.join(local_save_dir, filename),
173
+ os.path.join(local_save_dir, filename + ".svg"),
128
174
  format='svg',
129
175
  bbox_inches='tight',
130
- pad_inches=0
176
+ pad_inches=0.1
131
177
  )
132
178
  plt.close(fig)
133
-
179
+
134
180
  if one_plot:
135
181
  # Generate combined plot
136
182
  fig = kernel.plot_imputed_distributions(variables=column_names)
137
- _process_figure(fig, "Combined_Distributions.svg")
183
+ _process_figure(fig, "Combined_Distributions")
138
184
  # Generate individual plots per feature
139
185
  else:
140
186
  for feature in column_names:
141
187
  fig = kernel.plot_imputed_distributions(variables=[feature])
142
- _process_figure(fig, f"{feature}.svg")
188
+ feature_save_name = sanitize_filename(feature)
189
+ _process_figure(fig, feature_save_name)
143
190
 
144
- print("Imputed distributions saved successfully.")
191
+ print("\tImputed distributions saved successfully.")
145
192
 
146
193
 
147
194
  def run_mice_pipeline(df_path_or_dir: str, save_datasets_dir: str, save_metrics_dir: str, resulting_datasets: int=1, iterations: int=20, random_state: int=101):
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "dragon-ml-toolbox"
3
- version = "1.3.1"
3
+ version = "1.3.2"
4
4
  description = "A collection of tools for data science and machine learning projects"
5
5
  authors = [
6
6
  { name = "Karl Loza", email = "luigiloza@gmail.com" }
@@ -16,7 +16,8 @@ dependencies = [
16
16
  "numpy<2.0",
17
17
  "scikit-learn",
18
18
  "openpyxl",
19
- "miceforest",
19
+ "miceforest>=6.0.0,<7.0.0",
20
+ "plotnine>=0.12,<0.13",
20
21
  "matplotlib",
21
22
  "seaborn",
22
23
  "pandas",
@@ -24,9 +25,12 @@ dependencies = [
24
25
  "imbalanced-learn",
25
26
  "statsmodels",
26
27
  "ipython",
28
+ "ipykernel",
29
+ "notebook",
30
+ "jupyterlab",
27
31
  "joblib",
28
32
  "xgboost",
29
- "lightgbm",
33
+ "lightgbm<=4.5.0",
30
34
  "shap"
31
35
  ]
32
36