dragon-ml-toolbox 1.4.0__py3-none-any.whl → 1.4.2__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: dragon-ml-toolbox
- Version: 1.4.0
+ Version: 1.4.2
  Summary: A collection of tools for data science and machine learning projects
  Author-email: Karl Loza <luigiloza@gmail.com>
  License-Expression: MIT
@@ -80,7 +80,7 @@ Clone the repository and install in editable mode with optional dependencies:
  ```bash
  git clone https://github.com/DrAg0n-BoRn/ML_tools.git
  cd ML_tools
- pip install -e '.[pytorch]'
+ pip install -e .
  ```

  ## Usage
@@ -91,3 +91,19 @@ After installation, import modules like this:
  from ml_tools.utilities import sanitize_filename
  from ml_tools.logger import custom_logger
  ```
+
+ ## Available modules
+
+ ```bash
+ data_exploration
+ datasetmaster
+ ensemble_learning
+ handle_excel
+ logger
+ MICE_imputation
+ particle_swarm_optimization
+ trainer
+ utilities
+ VIF_factor
+ vision_helpers
+ ```
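Each of the modules listed above is importable as a submodule of the ml_tools package, following the import pattern the README already documents. A minimal sketch (the info() calls assume the 1.4.2 helpers added in the per-module diffs below; exact output formatting is not guaranteed):

```python
# Minimal illustration of the README's import pattern for the modules listed above.
from ml_tools import data_exploration, MICE_imputation, VIF_factor

# In 1.4.2 these modules expose an info() helper that lists their public tools
# (see the per-module diffs below).
data_exploration.info()
MICE_imputation.info()
```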
@@ -0,0 +1,19 @@
+ dragon_ml_toolbox-1.4.2.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
+ dragon_ml_toolbox-1.4.2.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=e1Hg5ZtaBpDV7ZvxhLe1ac28l7nMjvi1MSE5YvB1s-o,1472
+ ml_tools/MICE_imputation.py,sha256=CK0tYZ_kQkdETohOlhI7RP7oFkJTXrP-XtIxb--dzpU,9726
+ ml_tools/VIF_factor.py,sha256=LQWr1P8WYij07FX_3RZC6Rr22bfAMnrt0Lhvi7SbBpY,9846
+ ml_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ ml_tools/data_exploration.py,sha256=FXP5i6bQo8J3RCyLRmlX-qJVh4VH8DbMjrdUmyd1mF0,18708
+ ml_tools/datasetmaster.py,sha256=EFUEX-tqq94Ak1rXXYR-XaX85olrxvF2EuytdzUK7y0,29131
+ ml_tools/ensemble_learning.py,sha256=p8t5PI63N3G0ZgvOKmvFOvwJ24qqPdZCvyiDAx4ggXY,27670
+ ml_tools/handle_excel.py,sha256=ZJui5__0rc2T8UGHTheqZGhKmdVZ7Q2I54IoYCjAqJw,12612
+ ml_tools/logger.py,sha256=NOtL3YSuffAGmpTpXjY-uJjqFLdRG_jpL7MDyloBw9c,4712
+ ml_tools/particle_swarm_optimization.py,sha256=3xsc6sg-5o3cPbG_dWUyF3HdRVxgL4k_kRuPMU11NnM,20020
+ ml_tools/pytorch_models.py,sha256=bpWZsrSwCvHJQkR6UfoPpElsMv9AvmiNErNHC8NYB_I,10132
+ ml_tools/trainer.py,sha256=WAZ4EdrZuTOAnGXRWV3XcLNce4s7EKGf2-qchLC08Ik,15702
+ ml_tools/utilities.py,sha256=Pou-8IZsZj9NiZ_shhLt552yaKNvbnQ1Ztoj6VMHIeE,10091
+ ml_tools/vision_helpers.py,sha256=idQ-Ugp1IdsvwXiYyhYa9G3rTRTm37YRpkQDLEpANHM,7701
+ dragon_ml_toolbox-1.4.2.dist-info/METADATA,sha256=c95w_AETVdAwMYWrowJKxkC0wYCsgRrTmxyekPz7WBE,2516
+ dragon_ml_toolbox-1.4.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ dragon_ml_toolbox-1.4.2.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
+ dragon_ml_toolbox-1.4.2.dist-info/RECORD,,
ml_tools/MICE_imputation.py CHANGED
@@ -3,9 +3,20 @@ import miceforest as mf
  import os
  import matplotlib.pyplot as plt
  import numpy as np
- from ml_tools.utilities import load_dataframe, list_csv_paths, sanitize_filename
+ from ml_tools.utilities import load_dataframe, list_csv_paths, sanitize_filename, _script_info
  from plotnine import ggplot, labs, theme, element_blank # type: ignore

+
+ __all__ = [
+ "apply_mice",
+ "save_imputed_datasets",
+ "get_na_column_names",
+ "get_convergence_diagnostic",
+ "get_imputed_distributions",
+ "run_mice_pipeline"
+ ]
+
+
  def apply_mice(df: pd.DataFrame, df_name: str, resulting_datasets: int=1, iterations: int=20, random_state: int=101):

  # Initialize kernel with number of imputed datasets to generate
@@ -210,7 +221,7 @@ def run_mice_pipeline(df_path_or_dir: str, save_datasets_dir: str, save_metrics_
  if os.path.isfile(df_path_or_dir):
  all_file_paths = [df_path_or_dir]
  elif os.path.isdir(df_path_or_dir):
- all_file_paths = list_csv_paths(df_path_or_dir).values()
+ all_file_paths = list(list_csv_paths(df_path_or_dir).values())
  else:
  raise ValueError(f"Invalid path or directory: {df_path_or_dir}")

@@ -226,3 +237,7 @@ def run_mice_pipeline(df_path_or_dir: str, save_datasets_dir: str, save_metrics_
  get_convergence_diagnostic(kernel=kernel, imputed_dataset_names=imputed_dataset_names, column_names=imputed_column_names, root_dir=save_metrics_dir)

  get_imputed_distributions(kernel=kernel, df_name=df_name, root_dir=save_metrics_dir, column_names=imputed_column_names)
+
+
+ def info():
+ _script_info(__all__)
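Every module touched in this release gains the same pattern: an explicit `__all__` list plus an `info()` function that delegates to a private `_script_info` helper imported from ml_tools.utilities. The helper's implementation is not shown in this diff; the following is only a plausible sketch, inferred from the old `data_exploration.info()` it replaces (see further below) and from the fact that it receives a module's `__all__`:

```python
# Hypothetical reconstruction of ml_tools.utilities._script_info (not part of this diff).
# All the diff confirms is the call shape: _script_info(__all__).
def _script_info(all_names: list[str]) -> None:
    """Print the public tools exported by a module, one per line."""
    print(f"Available tools: {len(all_names)}")
    for i, name in enumerate(all_names, start=1):
        print(f"{i} - {name}")
```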
ml_tools/VIF_factor.py CHANGED
@@ -7,12 +7,19 @@ from statsmodels.stats.outliers_influence import variance_inflation_factor
  from statsmodels.tools.tools import add_constant
  import warnings
  import os
- from .utilities import sanitize_filename, yield_dataframes_from_dir, save_dataframe
+ from .utilities import sanitize_filename, yield_dataframes_from_dir, save_dataframe, _script_info
+
+
+ __all__ = [
+ "compute_vif",
+ "drop_vif_based",
+ "compute_vif_multi"
+ ]


  def compute_vif(
  df: pd.DataFrame,
- target_columns: Optional[list[str]] = None,
+ use_columns: Optional[list[str]] = None,
  ignore_columns: Optional[list[str]] = None,
  max_features_to_plot: int = 20,
  save_dir: Optional[str] = None,
@@ -25,7 +32,7 @@ def compute_vif(

  Args:
  df (pd.DataFrame): The input DataFrame.
- target_columns (list[str] | None): Optional list of columns to include. Defaults to all numeric columns.
+ use_columns (list[str] | None): Optional list of columns to include. Defaults to all numeric columns.
  ignore_columns (list[str] | None): Optional list of columns to exclude from the VIF computation. Skipped if `target_columns` is provided.
  max_features_to_plot (int): Adjust the number of features shown in the plot.
  save_dir (str | None): Directory to save the plot as SVG. If None, the plot is not saved.
@@ -42,20 +49,20 @@
  A VIF of 1 suggests no correlation, values between 1 and 5 indicate moderate correlation, and values greater than 10 typically signal high multicollinearity, which may distort model interpretation and degrade performance.
  """
  ground_truth_cols = df.columns.to_list()
- if target_columns is None:
+ if use_columns is None:
  sanitized_columns = df.select_dtypes(include='number').columns.tolist()
  missing_features = set(ground_truth_cols) - set(sanitized_columns)
  if missing_features:
  print(f"⚠️ These columns are not Numeric:\n{missing_features}")
  else:
  sanitized_columns = list()
- for feature in target_columns:
+ for feature in use_columns:
  if feature not in ground_truth_cols:
  print(f"⚠️ The provided column '{feature}' is not in the DataFrame.")
  else:
  sanitized_columns.append(feature)

- if ignore_columns is not None and target_columns is None:
+ if ignore_columns is not None and use_columns is None:
  missing_ignore = set(ignore_columns) - set(ground_truth_cols)
  if missing_ignore:
  print(f"⚠️ Warning: The following 'columns to ignore' are not in the Dataframe:\n{missing_ignore}")
@@ -137,7 +144,7 @@
  return vif_data.drop(columns="color")


- def drop_vif_based(df: pd.DataFrame, vif_df: pd.DataFrame, threshold: float = 10.0) -> pd.DataFrame:
+ def drop_vif_based(df: pd.DataFrame, vif_df: pd.DataFrame, threshold: float = 10.0) -> tuple[pd.DataFrame, list[str]]:
  """
  Drops columns from the original DataFrame based on their VIF values exceeding a given threshold.

@@ -147,7 +154,9 @@ def drop_vif_based(df: pd.DataFrame, vif_df: pd.DataFrame, threshold: float = 10
  threshold (float): VIF threshold above which columns will be dropped.

  Returns:
- pd.DataFrame: A new DataFrame with high-VIF columns removed.
+ (tuple[pd.DataFrame, list[str]]):
+ - A new DataFrame with high-VIF columns removed.
+ - A list with dropped column names.
  """
  # Ensure expected structure
  if 'feature' not in vif_df.columns or 'VIF' not in vif_df.columns:
@@ -162,13 +171,13 @@ def drop_vif_based(df: pd.DataFrame, vif_df: pd.DataFrame, threshold: float = 10
  if result_df.empty:
  print(f"\t⚠️ Warning: All columns were dropped.")

- return result_df
+ return result_df, to_drop


  def compute_vif_multi(input_directory: str,
  output_plot_directory: str,
  output_dataset_directory: Optional[str] = None,
- target_columns: Optional[list[str]] = None,
+ use_columns: Optional[list[str]] = None,
  ignore_columns: Optional[list[str]] = None,
  max_features_to_plot: int = 20,
  fontsize: int = 14):
@@ -180,7 +189,7 @@ def compute_vif_multi(input_directory: str,
  input_directory (str): Target directory with CSV files able to be loaded as DataFrame.
  output_plot_directory (str): Save plots to this directory.
  output_dataset_directory (str | None): If provided, saves new CSV files to this directory.
- target_columns (list[str] | None): Optional list of columns to include. Defaults to all numeric columns.
+ use_columns (list[str] | None): Optional list of columns to include. Defaults to all numeric columns.
  ignore_columns (list[str] | None): Optional list of columns to exclude from the VIF computation. Skipped if `target_columns` is provided.
  max_features_to_plot (int): Adjust the number of features shown in the plot.
  fontsize (int): Base fontsize to scale title and labels on hte plot.
@@ -195,7 +204,7 @@

  for df, df_name in yield_dataframes_from_dir(datasets_dir=input_directory):
  vif_dataframe = compute_vif(df=df,
- target_columns=target_columns,
+ use_columns=use_columns,
  ignore_columns=ignore_columns,
  max_features_to_plot=max_features_to_plot,
  fontsize=fontsize,
@@ -205,5 +214,11 @@

  if output_dataset_directory is not None:
  new_filename = 'VIF_' + df_name
- result_df = drop_vif_based(df=df, vif_df=vif_dataframe)
- save_dataframe(df=result_df, save_dir=output_dataset_directory, filename=new_filename)
+ result_df, dropped_cols = drop_vif_based(df=df, vif_df=vif_dataframe)
+
+ if len(dropped_cols) > 0:
+ save_dataframe(df=result_df, save_dir=output_dataset_directory, filename=new_filename)
+
+
+ def info():
+ _script_info(__all__)
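The VIF_factor changes rename compute_vif's `target_columns` parameter to `use_columns` and make drop_vif_based return the filtered DataFrame together with the names of the dropped columns; compute_vif_multi now only writes a CSV when at least one column was dropped. A minimal usage sketch against the new signatures (the DataFrame and column names below are illustrative, not from the package):

```python
import pandas as pd
from ml_tools.VIF_factor import compute_vif, drop_vif_based

# Illustrative data: "b" is nearly collinear with "a", so its VIF should be high.
df = pd.DataFrame({
    "a": [1.0, 2.0, 3.0, 4.0, 5.0],
    "b": [2.1, 3.9, 6.2, 7.8, 10.1],
    "c": [0.5, 1.1, 0.4, 0.9, 0.7],
})

# 1.4.2 keyword: use_columns (was target_columns in 1.4.0)
vif_table = compute_vif(df, use_columns=["a", "b", "c"])

# drop_vif_based now returns a tuple: (DataFrame without high-VIF columns, dropped names)
filtered_df, dropped_cols = drop_vif_based(df=df, vif_df=vif_table, threshold=10.0)
print(dropped_cols)
```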
ml_tools/data_exploration.py CHANGED
@@ -9,22 +9,23 @@ from typing import Union, Literal, Dict, Tuple
  import os
  import sys
  import textwrap
- from ml_tools.utilities import sanitize_filename
+ from ml_tools.utilities import sanitize_filename, _script_info


- # Keep track of all available functions, show using `info()`
- __all__ = ["summarize_dataframe",
- "drop_rows_with_missing_data",
- "split_features_targets",
- "show_null_columns",
- "drop_columns_with_missing_data",
- "split_continuous_binary",
- "plot_correlation_heatmap",
- "check_value_distributions",
- "plot_value_distributions",
- "clip_outliers_single",
- "clip_outliers_multi",
- "merge_dataframes"]
+ # Keep track of all available tools, show using `info()`
+ __all__ = [
+ "summarize_dataframe",
+ "drop_rows_with_missing_data",
+ "split_features_targets",
+ "show_null_columns",
+ "drop_columns_with_missing_data",
+ "split_continuous_binary",
+ "plot_correlation_heatmap",
+ "check_value_distributions",
+ "plot_value_distributions",
+ "clip_outliers_single",
+ "clip_outliers_multi"
+ ]


  def summarize_dataframe(df: pd.DataFrame, round_digits: int = 2):
@@ -58,34 +59,6 @@ def summarize_dataframe(df: pd.DataFrame, round_digits: int = 2):
  return summary


- def show_null_columns(df: pd.DataFrame, round_digits: int = 2):
- """
- Displays a table of columns with missing values, showing both the count and
- percentage of missing entries per column.
-
- Parameters:
- df (pd.DataFrame): The input DataFrame.
- round_digits (int): Number of decimal places for the percentage.
-
- Returns:
- pd.DataFrame: A DataFrame summarizing missing values in each column.
- """
- null_counts = df.isnull().sum()
- null_percent = df.isnull().mean() * 100
-
- # Filter only columns with at least one null
- mask = null_counts > 0
- null_summary = pd.DataFrame({
- 'Missing Count': null_counts[mask],
- 'Missing %': null_percent[mask].round(round_digits)
- })
-
- # Sort by descending percentage of missing values
- null_summary = null_summary.sort_values(by='Missing %', ascending=False)
- # print(null_summary)
- return null_summary
-
-
  def drop_rows_with_missing_data(df: pd.DataFrame, threshold: float = 0.7) -> pd.DataFrame:
  """
  Drops rows with more than `threshold` fraction of missing values.
@@ -132,6 +105,57 @@ def split_features_targets(df: pd.DataFrame, targets: list[str]):
  return df_targets, df_features


+ def show_null_columns(df: pd.DataFrame, round_digits: int = 2):
+ """
+ Displays a table of columns with missing values, showing both the count and
+ percentage of missing entries per column.
+
+ Parameters:
+ df (pd.DataFrame): The input DataFrame.
+ round_digits (int): Number of decimal places for the percentage.
+
+ Returns:
+ pd.DataFrame: A DataFrame summarizing missing values in each column.
+ """
+ null_counts = df.isnull().sum()
+ null_percent = df.isnull().mean() * 100
+
+ # Filter only columns with at least one null
+ mask = null_counts > 0
+ null_summary = pd.DataFrame({
+ 'Missing Count': null_counts[mask],
+ 'Missing %': null_percent[mask].round(round_digits)
+ })
+
+ # Sort by descending percentage of missing values
+ null_summary = null_summary.sort_values(by='Missing %', ascending=False)
+ # print(null_summary)
+ return null_summary
+
+
+ def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7) -> pd.DataFrame:
+ """
+ Drops columns with more than `threshold` fraction of missing values.
+
+ Parameters:
+ df (pd.DataFrame): The input DataFrame.
+ threshold (float): Fraction of missing values above which columns are dropped.
+
+ Returns:
+ pd.DataFrame: A new DataFrame without the dropped columns.
+ """
+ missing_fraction = df.isnull().mean()
+ cols_to_drop = missing_fraction[missing_fraction > threshold].index
+
+ if len(cols_to_drop) > 0:
+ print(f"Dropping columns with more than {threshold*100:.0f}% missing data:")
+ print(list(cols_to_drop))
+ else:
+ print(f"No columns have more than {threshold*100:.0f}% missing data.")
+
+ return df.drop(columns=cols_to_drop)
+
+
  def split_continuous_binary(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
  """
  Split DataFrame into two DataFrames: one with continuous columns, one with binary columns.
@@ -174,29 +198,6 @@ def split_continuous_binary(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFram

  return df_cont, df_bin # type: ignore

-
- def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7) -> pd.DataFrame:
- """
- Drops columns with more than `threshold` fraction of missing values.
-
- Parameters:
- df (pd.DataFrame): The input DataFrame.
- threshold (float): Fraction of missing values above which columns are dropped.
-
- Returns:
- pd.DataFrame: A new DataFrame without the dropped columns.
- """
- missing_fraction = df.isnull().mean()
- cols_to_drop = missing_fraction[missing_fraction > threshold].index
-
- if len(cols_to_drop) > 0:
- print(f"Dropping columns with more than {threshold*100:.0f}% missing data:")
- print(list(cols_to_drop))
- else:
- print(f"No columns have more than {threshold*100:.0f}% missing data.")
-
- return df.drop(columns=cols_to_drop)
-

  def plot_correlation_heatmap(df: pd.DataFrame, save_dir: Union[str, None] = None, method: Literal["pearson", "kendall", "spearman"]="pearson", plot_title: str="Correlation Heatmap"):
  """
@@ -513,83 +514,10 @@ def clip_outliers_multi(
  return new_df


- def merge_dataframes(
- *dfs: pd.DataFrame,
- reset_index: bool = False,
- direction: Literal["horizontal", "vertical"] = "horizontal"
- ) -> pd.DataFrame:
- """
- Merges multiple DataFrames either horizontally or vertically.
-
- Parameters:
- *dfs (pd.DataFrame): Variable number of DataFrames to merge.
- reset_index (bool): Whether to reset index in the final merged DataFrame.
- direction (["horizontal" | "vertical"]):
- - "horizontal": Merge on index, adding columns.
- - "vertical": Append rows; all DataFrames must have identical columns.
-
- Returns:
- pd.DataFrame: A single merged DataFrame.
-
- Raises:
- ValueError:
- - If fewer than 2 DataFrames are provided.
- - If indexes do not match for horizontal merge.
- - If column names or order differ for vertical merge.
- """
- if len(dfs) < 2:
- raise ValueError("At least 2 DataFrames must be provided.")
-
- for i, df in enumerate(dfs, start=1):
- print(f"DataFrame {i} shape: {df.shape}")
-
-
- if direction == "horizontal":
- reference_index = dfs[0].index
- for i, df in enumerate(dfs, start=1):
- if not df.index.equals(reference_index):
- raise ValueError(f"Indexes do not match: Dataset 1 and Dataset {i}.")
- merged_df = pd.concat(dfs, axis=1)
-
- elif direction == "vertical":
- reference_columns = dfs[0].columns
- for i, df in enumerate(dfs, start=1):
- if not df.columns.equals(reference_columns):
- raise ValueError(f"Column names/order do not match: Dataset 1 and Dataset {i}.")
- merged_df = pd.concat(dfs, axis=0)
-
- else:
- raise ValueError(f"Invalid merge direction: {direction}")
-
- if reset_index:
- merged_df = merged_df.reset_index(drop=True)
-
- print(f"Merged DataFrame shape: {merged_df.shape}")
-
- return merged_df
-
-
  def _is_notebook():
  return get_ipython() is not None


- def info(full_info: bool=True):
- """
- List available functions and their descriptions.
- """
- print("Available functions for data exploration:")
- if full_info:
- module = sys.modules[__name__]
- for name in __all__:
- obj = getattr(module, name, None)
- if callable(obj):
- doc = obj.__doc__ or "No docstring provided."
- formatted_doc = textwrap.indent(textwrap.dedent(doc.strip()), prefix=" ")
- print(f"\n{name}:\n{formatted_doc}")
- else:
- for i, name in enumerate(__all__, start=1):
- print(f"{i} - {name}")
-
+ def info():
+ _script_info(__all__)

- if __name__ == "__main__":
- info()
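show_null_columns and drop_columns_with_missing_data are moved lower in the module with their bodies unchanged, and the module-level info() now simply delegates to _script_info. A short usage sketch of the two missing-data helpers, based on the code shown above (the DataFrame is illustrative):

```python
import numpy as np
import pandas as pd
from ml_tools.data_exploration import show_null_columns, drop_columns_with_missing_data

df = pd.DataFrame({
    "age": [25, np.nan, 40, 31],
    "income": [50_000, 62_000, np.nan, np.nan],
    "mostly_missing": [np.nan, np.nan, np.nan, 1.0],
})

# Columns with missing values: count and percentage, sorted by percentage (descending)
print(show_null_columns(df, round_digits=1))

# Drop columns whose fraction of missing values exceeds the threshold (70% here)
cleaned = drop_columns_with_missing_data(df, threshold=0.7)
```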
ml_tools/datasetmaster.py CHANGED
@@ -11,6 +11,15 @@ from PIL import Image
  from torchvision.datasets import ImageFolder
  from torchvision import transforms
  import matplotlib.pyplot as plt
+ from .utilities import _script_info
+
+
+ __all__ = [
+ "DatasetMaker",
+ "PytorchDataset",
+ "make_vision_dataset",
+ "SequenceDataset",
+ ]


  class DatasetMaker():
@@ -592,4 +601,7 @@ class SequenceDataset():

  def __len__(self):
  return f"Train: {len(self.train_dataset)}, Test: {len(self.test_dataset)}"
-
+
+
+ def info():
+ _script_info(__all__)