dragon-ml-toolbox 1.4.3__py3-none-any.whl → 1.4.5__py3-none-any.whl

This diff shows the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only and reflects the changes between those versions.

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: dragon-ml-toolbox
- Version: 1.4.3
+ Version: 1.4.5
  Summary: A collection of tools for data science and machine learning projects
  Author-email: Karl Loza <luigiloza@gmail.com>
  License-Expression: MIT
@@ -0,0 +1,19 @@
+ dragon_ml_toolbox-1.4.5.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
+ dragon_ml_toolbox-1.4.5.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=jDnniT0tgD0uw1NpjibsPF-qK3wmOKgTykLG2iNQU7E,1840
+ ml_tools/MICE_imputation.py,sha256=JMe9hyidJadFTHW7AHkNQ_fduTxH6CEh7_Ouy2LhCOQ,11096
+ ml_tools/VIF_factor.py,sha256=HEBsLJy_qSDaPw1Btha5B7omxN4wjJXg-sqoetCjCJw,10016
+ ml_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ ml_tools/data_exploration.py,sha256=vSjqKwtmPm1RHhHdC5AWH8Edg78nTM5SowUzWG2AxdY,18951
+ ml_tools/datasetmaster.py,sha256=EFUEX-tqq94Ak1rXXYR-XaX85olrxvF2EuytdzUK7y0,29131
+ ml_tools/ensemble_learning.py,sha256=xJyEbkFObm5YX6DmDW10FOUjSeYeBRhHLvncWZv_uTo,37319
+ ml_tools/handle_excel.py,sha256=ZJui5__0rc2T8UGHTheqZGhKmdVZ7Q2I54IoYCjAqJw,12612
+ ml_tools/logger.py,sha256=ZTtUB9HTkNs5zHTdYRKNbKADjUkuObsF7s8U5pNnVRA,4716
+ ml_tools/particle_swarm_optimization.py,sha256=c5TG_MnCgxzkcREkqbJHqi6x_fovikUv2ePtKFL4HL8,20193
+ ml_tools/pytorch_models.py,sha256=bpWZsrSwCvHJQkR6UfoPpElsMv9AvmiNErNHC8NYB_I,10132
+ ml_tools/trainer.py,sha256=WAZ4EdrZuTOAnGXRWV3XcLNce4s7EKGf2-qchLC08Ik,15702
+ ml_tools/utilities.py,sha256=6369V_8VcDBPqUBOYS6VI6JKt0-wq_xVq9voTOU6VsQ,14515
+ ml_tools/vision_helpers.py,sha256=idQ-Ugp1IdsvwXiYyhYa9G3rTRTm37YRpkQDLEpANHM,7701
+ dragon_ml_toolbox-1.4.5.dist-info/METADATA,sha256=BDFkLly4Ylq2yPdQC3T4UVFr21fWjYlNgudq_bVWk-g,2516
+ dragon_ml_toolbox-1.4.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ dragon_ml_toolbox-1.4.5.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
+ dragon_ml_toolbox-1.4.5.dist-info/RECORD,,
ml_tools/MICE_imputation.py CHANGED
@@ -3,8 +3,9 @@ import miceforest as mf
  import os
  import matplotlib.pyplot as plt
  import numpy as np
- from ml_tools.utilities import load_dataframe, list_csv_paths, sanitize_filename, _script_info, merge_dataframes, save_dataframe
+ from ml_tools.utilities import load_dataframe, list_csv_paths, sanitize_filename, _script_info, merge_dataframes, save_dataframe, threshold_binary_values
  from plotnine import ggplot, labs, theme, element_blank # type: ignore
+ from typing import Optional
 
 
  __all__ = [
@@ -17,7 +18,7 @@ __all__ = [
  ]
 
 
- def apply_mice(df: pd.DataFrame, df_name: str, resulting_datasets: int=1, iterations: int=20, random_state: int=101):
+ def apply_mice(df: pd.DataFrame, df_name: str, binary_columns: Optional[list[str]]=None, resulting_datasets: int=1, iterations: int=20, random_state: int=101):
 
  # Initialize kernel with number of imputed datasets to generate
  kernel = mf.ImputationKernel(
@@ -35,10 +36,20 @@ def apply_mice(df: pd.DataFrame, df_name: str, resulting_datasets: int=1, iterat
  if imputed_datasets is None or len(imputed_datasets) == 0:
  raise ValueError("No imputed datasets were generated. Check the MICE process.")
 
+ # threshold binary columns
+ if binary_columns is not None:
+ invalid_binary_columns = set(binary_columns) - set(df.columns)
+ if invalid_binary_columns:
+ print(f"⚠️ These 'binary columns' are not in the dataset: {invalid_binary_columns}")
+ valid_binary_columns = [col for col in binary_columns if col not in invalid_binary_columns]
+ for imputed_df in imputed_datasets:
+ for binary_column_name in valid_binary_columns:
+ imputed_df[binary_column_name] = threshold_binary_values(imputed_df[binary_column_name]) # type: ignore
+
  if resulting_datasets == 1:
- imputed_dataset_names = [f"{df_name}_imputed"]
+ imputed_dataset_names = [f"{df_name}_MICE"]
  else:
- imputed_dataset_names = [f"{df_name}_imputed_{i+1}" for i in range(resulting_datasets)]
+ imputed_dataset_names = [f"{df_name}_MICE_{i+1}" for i in range(resulting_datasets)]
 
  # Ensure indexes match
  for imputed_df, subname in zip(imputed_datasets, imputed_dataset_names):
@@ -106,7 +117,7 @@ def get_convergence_diagnostic(kernel: mf.ImputationKernel, imputed_dataset_name
  # Adjust plot display for the X axis
  _ticks = np.arange(iterations_cap)
  _labels = np.arange(1, iterations_cap + 1)
- plt.xticks(ticks=_ticks, labels=_labels)
+ plt.xticks(ticks=_ticks, labels=_labels) # type: ignore
  plt.grid(True)
 
  feature_save_name = sanitize_filename(feature_name)
@@ -202,7 +213,12 @@ def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_di
  print(f"{local_dir_name} completed.")
 
 
- def run_mice_pipeline(df_path_or_dir: str, target_columns: list[str], save_datasets_dir: str, save_metrics_dir: str, resulting_datasets: int=1, iterations: int=20, random_state: int=101):
+ def run_mice_pipeline(df_path_or_dir: str, target_columns: list[str],
+ save_datasets_dir: str, save_metrics_dir: str,
+ binary_columns: Optional[list[str]]=None,
+ resulting_datasets: int=1,
+ iterations: int=20,
+ random_state: int=101):
  """
  Call functions in sequence for each dataset in the provided path or directory:
  1. Load dataframe
@@ -211,7 +227,7 @@ def run_mice_pipeline(df_path_or_dir: str, target_columns: list[str], save_datas
  4. Save convergence metrics
  5. Save distribution metrics
 
- Target columns must be skipped from the imputation.
+ Target columns must be skipped from the imputation. Binary columns will be thresholded after imputation.
  """
  # Check paths
  os.makedirs(save_datasets_dir, exist_ok=True)
@@ -229,7 +245,7 @@ def run_mice_pipeline(df_path_or_dir: str, target_columns: list[str], save_datas
 
  df, df_targets = _skip_targets(df, target_columns)
 
- kernel, imputed_datasets, imputed_dataset_names = apply_mice(df=df, df_name=df_name, resulting_datasets=resulting_datasets, iterations=iterations, random_state=random_state)
+ kernel, imputed_datasets, imputed_dataset_names = apply_mice(df=df, df_name=df_name, binary_columns=binary_columns, resulting_datasets=resulting_datasets, iterations=iterations, random_state=random_state)
 
  save_imputed_datasets(save_dir=save_datasets_dir, imputed_datasets=imputed_datasets, df_targets=df_targets, imputed_dataset_names=imputed_dataset_names)
 
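Below is a minimal usage sketch of the new `binary_columns` handling; the DataFrame, column names, and values are illustrative and not taken from the package.

```python
import pandas as pd
from ml_tools.MICE_imputation import apply_mice

# Illustrative data: 'age' is continuous, 'smoker' is binary, both with gaps.
df = pd.DataFrame({
    "age": [34.0, None, 51.0, 29.0, None, 40.0],
    "smoker": [1.0, 0.0, None, 1.0, 0.0, None],
})

# binary_columns tells apply_mice to threshold the listed columns back to 0/1
# after MICE has filled them with continuous estimates.
kernel, imputed_datasets, names = apply_mice(
    df, df_name="demo", binary_columns=["smoker"], resulting_datasets=1
)
print(names)  # ['demo_MICE'] under the new naming scheme
```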
ml_tools/VIF_factor.py CHANGED
@@ -26,6 +26,7 @@ def compute_vif(
  filename: Optional[str] = None,
  fontsize: int = 14,
  show_plot: bool = True,
+ verbose: bool = True
  ) -> pd.DataFrame:
  """
  Computes Variance Inflation Factors (VIF) for numeric columns in a DataFrame. Optionally, generates a bar plot of VIF values.
@@ -52,19 +53,20 @@
  if use_columns is None:
  sanitized_columns = df.select_dtypes(include='number').columns.tolist()
  missing_features = set(ground_truth_cols) - set(sanitized_columns)
- if missing_features:
+ if missing_features and verbose:
  print(f"⚠️ These columns are not Numeric:\n{missing_features}")
  else:
  sanitized_columns = list()
  for feature in use_columns:
  if feature not in ground_truth_cols:
- print(f"⚠️ The provided column '{feature}' is not in the DataFrame.")
+ if verbose:
+ print(f"⚠️ The provided column '{feature}' is not in the DataFrame.")
  else:
  sanitized_columns.append(feature)
 
  if ignore_columns is not None and use_columns is None:
  missing_ignore = set(ignore_columns) - set(ground_truth_cols)
- if missing_ignore:
+ if missing_ignore and verbose:
  print(f"⚠️ Warning: The following 'columns to ignore' are not in the Dataframe:\n{missing_ignore}")
  sanitized_columns = [f for f in sanitized_columns if f not in ignore_columns]
 
@@ -182,7 +184,7 @@ def compute_vif_multi(input_directory: str,
  max_features_to_plot: int = 20,
  fontsize: int = 14):
  """
- Computes Variance Inflation Factors (VIF) for numeric columns in a directory with CSV files (loaded as pandas DataFrames).
+ Computes Variance Inflation Factors (VIF) for numeric columns in a directory with CSV files (loaded as pandas DataFrames). No plots or warnings will be displayed inline.
  Generates a bar plot of VIF values. Optionally drops columns with VIF >= 10 and saves as a new CSV file.
 
  Args:
@@ -210,10 +212,11 @@
  fontsize=fontsize,
  save_dir=output_plot_directory,
  filename=df_name,
- show_plot=False)
+ show_plot=False,
+ verbose=False)
 
  if output_dataset_directory is not None:
- new_filename = 'VIF_' + df_name
+ new_filename = df_name + '_VIF'
  result_df, dropped_cols = drop_vif_based(df=df, vif_df=vif_dataframe)
 
  if len(dropped_cols) > 0:
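A small sketch of the new `verbose` switch, assuming `compute_vif` takes the DataFrame as its first argument; the data below is made up.

```python
import pandas as pd
from ml_tools.VIF_factor import compute_vif

df = pd.DataFrame({
    "x1": [1.0, 2.0, 3.0, 4.0, 5.0],
    "x2": [2.1, 3.9, 6.2, 8.1, 9.8],      # nearly collinear with x1
    "label": ["a", "b", "c", "d", "e"],   # non-numeric column
})

# verbose=True (the default) keeps the old behavior and prints warnings about
# non-numeric or missing columns; compute_vif_multi now passes verbose=False
# so batch runs stay quiet.
vif_df = compute_vif(df, show_plot=False, verbose=False)
print(vif_df)
```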
ml_tools/data_exploration.py CHANGED
@@ -22,8 +22,7 @@ __all__ = [
  "check_value_distributions",
  "plot_value_distributions",
  "clip_outliers_single",
- "clip_outliers_multi",
- "distribute_datasets_by_target"
+ "clip_outliers_multi"
  ]
 
 
@@ -90,18 +89,18 @@ def split_features_targets(df: pd.DataFrame, targets: list[str]):
 
  Returns:
  tuple: A tuple containing:
- - pd.DataFrame: Targets dataframe.
  - pd.DataFrame: Features dataframe.
+ - pd.DataFrame: Targets dataframe.
 
  Prints:
  - Shape of the original dataframe.
- - Shape of the targets dataframe.
  - Shape of the features dataframe.
+ - Shape of the targets dataframe.
  """
  df_targets = df[targets]
  df_features = df.drop(columns=targets)
- print(f"Original shape: {df.shape}\nTargets shape: {df_targets.shape}\nFeatures shape: {df_features.shape}")
- return df_targets, df_features
+ print(f"Original shape: {df.shape}\nFeatures shape: {df_features.shape}\nTargets shape: {df_targets.shape}")
+ return df_features, df_targets
 
 
  def show_null_columns(df: pd.DataFrame, round_digits: int = 2):
@@ -153,7 +152,7 @@ def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7, sho
 
  result_df = df.drop(columns=cols_to_drop)
  if show_nulls_after:
- show_null_columns(df=result_df).head(20)
+ print(show_null_columns(df=result_df))
 
  return result_df
  else:
@@ -259,7 +258,7 @@ def plot_correlation_heatmap(df: pd.DataFrame, save_dir: Union[str, None] = None
  os.makedirs(save_dir, exist_ok=True)
  full_path = os.path.join(save_dir, plot_title + ".svg")
  plt.savefig(full_path, bbox_inches="tight", format='svg')
- print(f"Saved correlation heatmap to: {full_path}")
+ print(f"Saved correlation heatmap: '{plot_title}.svg'")
 
  plt.show()
  plt.close()
@@ -519,38 +518,9 @@ def clip_outliers_multi(
  return new_df
 
 
- def distribute_datasets_by_target(
- df: pd.DataFrame,
- target_columns: list[str]
- ) -> Iterator[Tuple[str, pd.DataFrame]]:
- """
- Yields cleaned DataFrames for each target column, where rows with missing
- target values are removed. The target column is placed at the end.
-
- Parameters
- ----------
- df : pd.DataFrame
- Preprocessed dataframe with all feature and target columns ready to train.
- target_columns : List[str]
- List of target column names to generate per-target DataFrames.
-
- Yields
- ------
- Tuple[str, pd.DataFrame]
- * First element is the target column name.
- * Second element is the corresponding cleaned DataFrame.
- """
- feature_columns = [col for col in df.columns if col not in target_columns]
-
- for target in target_columns:
- subset = df[feature_columns + [target]].dropna(subset=[target])
- yield target, subset
-
-
  def _is_notebook():
  return get_ipython() is not None
 
 
  def info():
  _script_info(__all__)
-
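The return order of `split_features_targets` is swapped in this release (features first, then targets), so callers that unpacked `(targets, features)` need updating. A minimal sketch with made-up data:

```python
import pandas as pd
from ml_tools.data_exploration import split_features_targets

df = pd.DataFrame({
    "height": [1.70, 1.82, 1.65],
    "weight": [68.0, 90.0, 55.0],
    "target": [0, 1, 0],
})

# 1.4.5 order: features first, targets second (the reverse of 1.4.3).
df_features, df_targets = split_features_targets(df, targets=["target"])
print(df_features.shape, df_targets.shape)  # (3, 2) (3, 1)
```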
ml_tools/ensemble_learning.py CHANGED
@@ -20,7 +20,7 @@ from sklearn.model_selection import train_test_split
  from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay, mean_absolute_error, mean_squared_error, r2_score, roc_curve, roc_auc_score
  import shap
 
- from .utilities import yield_dataframes_from_dir, sanitize_filename, _script_info
+ from .utilities import yield_dataframes_from_dir, sanitize_filename, _script_info, serialize_object
 
  import warnings # Ignore warnings
  warnings.filterwarnings('ignore', category=DeprecationWarning)
@@ -157,9 +157,7 @@ class RegressionTreeModels:
  self.gamma = gamma
 
  # LightGBM specific
- if num_leaves >= (2**max_depth):
- num_leaves = (2**max_depth) - 1
- print(f"⚠️ Warning: 'num_leaves' should be set proportional to 'max_depth'. Value set as {num_leaves}.")
+ num_leaves = min(num_leaves, 2 ** (max_depth - 1))
  self.num_leaves = num_leaves
  self.min_data_in_leaf = min_data_in_leaf
 
@@ -202,7 +200,7 @@
  verbose=-1,
  reg_alpha=self.L1,
  reg_lambda=self.L2,
- boosting_type='dart',
+ boosting_type='gbdt',
  num_leaves=self.num_leaves,
  min_data_in_leaf=self.min_data_in_leaf
  )
@@ -321,9 +319,7 @@ class ClassificationTreeModels:
  self.gamma = gamma
 
  # LightGBM specific
- if num_leaves >= (2**max_depth):
- num_leaves = (2**max_depth) - 1
- print(f"⚠️ Warning: 'num_leaves' should be set proportional to 'max_depth'. Value set as {num_leaves}.")
+ num_leaves = min(num_leaves, 2 ** (max_depth - 1))
  self.num_leaves = num_leaves
  self.min_data_in_leaf = min_data_in_leaf
 
@@ -370,7 +366,7 @@
  verbose=-1,
  reg_alpha=self.L1,
  reg_lambda=self.L2,
- boosting_type='dart' if self.use_model_balance else 'goss',
+ boosting_type='gbdt' if self.use_model_balance else 'goss',
  num_leaves=self.num_leaves,
  min_data_in_leaf=self.min_data_in_leaf,
  class_weight='balanced' if self.use_model_balance else None
@@ -489,8 +485,9 @@ def _local_directories(model_name: str, dataset_id: str, save_dir: str):
  def _save_model(trained_model, model_name: str, target_name:str, feature_names: list[str], save_directory: str):
  #Sanitize filenames to save
  sanitized_target_name = sanitize_filename(target_name)
- full_path = os.path.join(save_directory, f"{model_name}_{sanitized_target_name}.joblib")
- joblib.dump({'model': trained_model, 'feature_names': feature_names, 'target_name':target_name}, full_path)
+ filename = f"{model_name}_{sanitized_target_name}"
+ to_save = {'model': trained_model, 'feature_names': feature_names, 'target_name':target_name}
+ serialize_object(obj=to_save, save_dir=save_directory, filename=filename, verbose=False, raise_on_error=True)
 
  # function to evaluate the model and save metrics (Classification)
  def evaluate_model_classification(
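Besides the LightGBM `boosting_type` switch from 'dart' to 'gbdt', the `num_leaves` guard changed: instead of clipping to `2**max_depth - 1` only when exceeded (with a printed warning), both model classes now silently cap at `2**(max_depth - 1)`. A quick arithmetic sketch of the difference:

```python
# Not library code; just the two caps side by side for an example depth.
max_depth = 6
num_leaves = 100

old_cap = (2 ** max_depth) - 1       # 63 in 1.4.3, applied only when exceeded
new_cap = 2 ** (max_depth - 1)       # 32 in 1.4.5, applied via min()

print(min(num_leaves, old_cap), min(num_leaves, new_cap))  # 63 32
```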
ml_tools/logger.py CHANGED
@@ -55,7 +55,7 @@ def custom_logger(
  """
  try:
  os.makedirs(save_directory, exist_ok=True)
- timestamp = datetime.now().strftime(r"%Y%m%d_%H%M")
+ timestamp = datetime.now().strftime(r"%Y%m%d_%H%M%S")
  log_name = sanitize_filename(log_name)
  base_path = os.path.join(save_directory, f"{log_name}_{timestamp}")
 
@@ -80,7 +80,7 @@ def custom_logger(
  else:
  raise ValueError("Unsupported data type. Must be list, dict, DataFrame, str, or BaseException.")
 
- print(f"Log saved to: {base_path}")
+ print(f"Log saved to: '{base_path}'")
 
  except Exception as e:
  print(f"Error in custom_logger: {e}")
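The only functional change here is the timestamp resolution: adding `%S` means two logs written within the same minute no longer collide on the same filename. A minimal sketch of the two formats:

```python
from datetime import datetime

old_stamp = datetime.now().strftime(r"%Y%m%d_%H%M")    # minute resolution (1.4.3)
new_stamp = datetime.now().strftime(r"%Y%m%d_%H%M%S")  # second resolution (1.4.5)
print(old_stamp, new_stamp)
```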
ml_tools/particle_swarm_optimization.py CHANGED
@@ -8,7 +8,7 @@ from sklearn.base import ClassifierMixin
  from typing import Literal, Union, Tuple, Dict, Optional
  import polars as pl
  from functools import partial
- from .utilities import sanitize_filename, _script_info, threshold_binary_values
+ from .utilities import sanitize_filename, _script_info, threshold_binary_values, deserialize_object
 
 
  __all__ = [
@@ -38,7 +38,7 @@ class ObjectiveFunction():
  self.binary_features = binary_features
  self.is_hybrid = False if binary_features <= 0 else True
  self.use_noise = add_noise
- self._artifact = joblib.load(trained_model_path)
+ self._artifact = deserialize_object(trained_model_path, verbose=False, raise_on_error=True)
  self.model = self._get_from_artifact('model')
  self.feature_names: Optional[list[str]] = self._get_from_artifact('feature_names') # type: ignore
  self.target_name: Optional[str] = self._get_from_artifact('target_name') # type: ignore
@@ -49,7 +49,7 @@ class ObjectiveFunction():
  if self.use_noise:
  features_array = self.add_noise(features_array)
  if self.is_hybrid:
- features_array = threshold_binary_values(input_array=features_array, binary_features=self.binary_features)
+ features_array = threshold_binary_values(input_array=features_array, binary_values=self.binary_features) # type: ignore
 
  if features_array.ndim == 1:
  features_array = features_array.reshape(1, -1)
@@ -83,6 +83,8 @@ class ObjectiveFunction():
  raise ValueError("Loaded model is None")
 
  def _get_from_artifact(self, key: str):
+ if self._artifact is None:
+ raise TypeError("Load model error")
  val = self._artifact.get(key)
  if key == "feature_names":
  result = val if isinstance(val, list) and val else None
@@ -129,10 +131,10 @@ def run_pso(lower_boundaries: list[float],
  target_name: Union[str, None]=None,
  feature_names: Union[list[str], None]=None,
  swarm_size: int=200,
- max_iterations: int=1000,
+ max_iterations: int=1500,
  inequality_constrain_function=None,
- post_hoc_analysis: Optional[int]=3,
- workers: int=3) -> Tuple[Dict[str, float | list[float]], Dict[str, float | list[float]]]:
+ post_hoc_analysis: Optional[int]=5,
+ workers: int=1) -> Tuple[Dict[str, float | list[float]], Dict[str, float | list[float]]]:
  """
  Executes Particle Swarm Optimization (PSO) to optimize a given objective function and saves the results as a CSV file.
 
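`ObjectiveFunction` now loads the model artifact through `deserialize_object` instead of calling `joblib.load` directly. Below is a hedged sketch of the round trip, matching the dict layout that `_save_model` writes; the directory, filename, and placeholder model are illustrative only.

```python
from ml_tools.utilities import serialize_object, deserialize_object

# _save_model stores a plain dict; a real trained estimator would replace None.
artifact = {"model": None, "feature_names": ["x1", "x2"], "target_name": "y"}
path = serialize_object(obj=artifact, save_dir="demo_models", filename="LightGBM_y")

# ObjectiveFunction reads 'model', 'feature_names', and 'target_name' back out.
loaded = deserialize_object(path, verbose=False, raise_on_error=True)
print(loaded["target_name"])  # 'y'
```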
ml_tools/utilities.py CHANGED
@@ -1,10 +1,13 @@
  import math
  import numpy as np
  import pandas as pd
+ import polars as pl
  import os
  from pathlib import Path
  import re
- from typing import Literal, Union, Sequence
+ from typing import Literal, Union, Sequence, Optional, Any, Iterator, Tuple
+ import joblib
+ from joblib.externals.loky.process_executor import TerminatedWorkerError
 
 
  # Keep track of available tools
@@ -16,7 +19,10 @@ __all__ = [
  "save_dataframe",
  "normalize_mixed_list",
  "sanitize_filename",
- "threshold_binary_values"
+ "threshold_binary_values",
+ "serialize_object",
+ "deserialize_object",
+ "distribute_datasets_by_target"
  ]
 
 
@@ -194,12 +200,9 @@ def normalize_mixed_list(data: list, threshold: int = 2) -> list[float]:
 
  Returns:
  List[float]: A list of normalized float values summing to 1.0.
- Values significantly smaller than the median scale are scaled up
- before normalization to correct likely input errors.
 
  Notes:
  - Zeros and None values remain zero.
- - If all input values are zero or None, the function returns a list of zeros.
  - Input strings are automatically cast to floats if possible.
 
  Example:
@@ -268,35 +271,156 @@ def sanitize_filename(filename: str) -> str:
 
 
  def threshold_binary_values(
- input_array: Union[Sequence[float], np.ndarray],
- binary_features: int
- ) -> np.ndarray:
+ input_array: Union[Sequence[float], np.ndarray, pd.Series, pl.Series],
+ binary_values: Optional[int] = None
+ ) -> Union[np.ndarray, pd.Series, pl.Series, list[float], tuple[float]]:
  """
- Thresholds binary features in a 1D numeric sequence. Binary features must be located at the end of the sequence.
+ Thresholds binary features in a 1D input. The number of binary features are counted starting from the end.
 
- Converts binary elements to values (0 or 1) using a threshold of 0.5. The rest of the array (assumed to be continuous features) is returned unchanged.
+ Parameters:
+ input_array: 1D sequence, NumPy array, pandas Series, or polars Series.
+ binary_values (Optional[int]) :
+ - If `None`, all values are treated as binary.
+ - If `int`, only this many last `binary_values` are thresholded.
+
+ Returns:
+ Same type as input, with binary elements binarized to 0 or 1 using a 0.5 threshold.
+ """
+ original_type = type(input_array)
+
+ if isinstance(input_array, pl.Series):
+ array = input_array.to_numpy()
+ elif isinstance(input_array, (pd.Series, np.ndarray)):
+ array = np.asarray(input_array)
+ elif isinstance(input_array, (list, tuple)):
+ array = np.array(input_array)
+ else:
+ raise TypeError("Unsupported input type")
+
+ array = array.flatten()
+ total = array.shape[0]
+
+ bin_count = total if binary_values is None else binary_values
+ if not (0 <= bin_count <= total):
+ raise ValueError("binary_values must be between 0 and the total number of elements")
+
+ if bin_count == 0:
+ result = array
+ else:
+ cont_part = array[:-bin_count] if bin_count < total else np.array([])
+ bin_part = (array[-bin_count:] > 0.5).astype(int)
+ result = np.concatenate([cont_part, bin_part])
+
+ if original_type is pd.Series:
+ return pd.Series(result, index=input_array.index if hasattr(input_array, 'index') else None) # type: ignore
+ elif original_type is pl.Series:
+ return pl.Series(input_array.name if hasattr(input_array, 'name') else "binary", result) # type: ignore
+ elif original_type is list:
+ return result.tolist()
+ elif original_type is tuple:
+ return tuple(result)
+ else:
+ return result
+
+
+ def serialize_object(obj: Any, save_dir: str, filename: str, verbose: bool=True, raise_on_error: bool=False) -> Optional[str]:
+ """
+ Serializes a Python object using joblib; suitable for Python built-ins, numpy, and pandas.
 
  Parameters:
- input_array (Union[Sequence[float], np.ndarray]) : A one-dimensional collection of numeric values. The binary features must be located at the end of the array.
+ obj (Any) : The Python object to serialize.
+ save_dir (str) : Directory path where the serialized object will be saved.
+ filename (str) : Name for the output file, extension will be appended if needed.
+
+ Returns:
+ (str | None) : The full file path where the object was saved if successful; otherwise, None.
+ """
+ try:
+ os.makedirs(save_dir, exist_ok=True)
+ sanitized_name = sanitize_filename(filename)
+ if not sanitized_name.endswith('.joblib'):
+ sanitized_name = sanitized_name + ".joblib"
+ full_path = os.path.join(save_dir, sanitized_name)
+ joblib.dump(obj, full_path)
+ except (IOError, OSError, TypeError, TerminatedWorkerError) as e:
+ message = f"❌ Failed to serialize object of type '{type(obj)}': {e}"
+ if raise_on_error:
+ raise Exception(message)
+ else:
+ print(message)
+ return None
+ else:
+ if verbose:
+ print(f"✅ Object of type '{type(obj)}' saved to '{full_path}'")
+ return full_path
+
 
- binary_features (int) : Number of binary features to threshold from the end of the array. Must be between 0 and the total number of elements.
+ def deserialize_object(filepath: str, verbose: bool=True, raise_on_error: bool=True) -> Optional[Any]:
+ """
+ Loads a serialized object from a .joblib file.
+
+ Parameters:
+ filepath (str): Full path to the serialized .joblib file.
 
  Returns:
- np.ndarray : A 1D NumPy array where the final `binary_features` values have been binarized.
+ (Any | None): The deserialized Python object, or None if loading fails.
  """
- array = np.asarray(input_array).flatten()
- total = array.shape[0]
-
- if binary_features < 0 or binary_features > total:
- raise ValueError("Binary features must be between 0 and the total number of features.")
-
- if binary_features == 0:
- return array
+ if not os.path.exists(filepath):
+ print(f"❌ File does not exist: {filepath}")
+ return None
+ try:
+ obj = joblib.load(filepath)
+ except (IOError, OSError, EOFError, TypeError, ValueError) as e:
+ message = f"❌ Failed to deserialize object from '{filepath}': {e}"
+ if raise_on_error:
+ raise Exception(message)
+ else:
+ print(message)
+ return None
+ else:
+ if verbose:
+ print(f"✅ Loaded object of type '{type(obj)}'")
+ return obj
 
- cont_part = array[:-binary_features]
- bin_part = (array[-binary_features:] > 0.5).astype(int)
+
+ def distribute_datasets_by_target(
+ df_or_path: Union[pd.DataFrame, str],
+ target_columns: list[str],
+ verbose: bool = False
+ ) -> Iterator[Tuple[str, pd.DataFrame]]:
+ """
+ Yields cleaned DataFrames for each target column, where rows with missing
+ target values are removed. The target column is placed at the end.
+
+ Parameters
+ ----------
+ df_or_path : [pd.DataFrame | str]
+ Dataframe or path to Dataframe with all feature and target columns ready to split and train a model.
+ target_columns : List[str]
+ List of target column names to generate per-target DataFrames.
+ verbose: bool
+ Whether to print info for each yielded dataset.
+
+ Yields
+ ------
+ Tuple[str, pd.DataFrame]
+ * First element is the target column name.
+ * Second element is the corresponding cleaned DataFrame.
+ """
+ # Validate path
+ if isinstance(df_or_path, str):
+ df, _ = load_dataframe(df_or_path)
+ else:
+ df = df_or_path
 
- return np.concatenate([cont_part, bin_part])
+ valid_targets = [col for col in df.columns if col in target_columns]
+ feature_columns = [col for col in df.columns if col not in valid_targets]
+
+ for target in valid_targets:
+ subset = df[feature_columns + [target]].dropna(subset=[target])
+ if verbose:
+ print(f"Target: '{target}' - Dataframe shape: {subset.shape}")
+ yield target, subset
 
 
  def _script_info(all_data: list[str]):
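Two behavioral notes on the reworked `threshold_binary_values`: with `binary_values=None` every element is thresholded, and the return type now mirrors the input type. `distribute_datasets_by_target` also lives here now (moved from `data_exploration`) and additionally accepts a CSV path. A short sketch with made-up values:

```python
import numpy as np
import pandas as pd
from ml_tools.utilities import threshold_binary_values, distribute_datasets_by_target

# With binary_values=None every element is thresholded at 0.5, and a pandas
# Series comes back as a Series with its index preserved.
probs = pd.Series([0.2, 0.7, 0.49, 0.51])
print(threshold_binary_values(probs).tolist())  # [0.0, 1.0, 0.0, 1.0]

# With binary_values=2 only the trailing two elements are treated as binary;
# the leading continuous value passes through unchanged.
row = np.array([12.3, 0.8, 0.1])
print(threshold_binary_values(row, binary_values=2))  # [12.3  1.   0. ]

# distribute_datasets_by_target yields one cleaned frame per target,
# dropping rows where that particular target is missing.
df = pd.DataFrame({
    "feat": [1.0, 2.0, 3.0, 4.0],
    "y1": [0.1, None, 0.3, 0.4],
    "y2": [None, 1.0, 0.0, 1.0],
})
for target, subset in distribute_datasets_by_target(df, ["y1", "y2"]):
    print(target, subset.shape)  # y1 (3, 2) then y2 (3, 2)
```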
@@ -1,19 +0,0 @@
- dragon_ml_toolbox-1.4.3.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
- dragon_ml_toolbox-1.4.3.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=jDnniT0tgD0uw1NpjibsPF-qK3wmOKgTykLG2iNQU7E,1840
- ml_tools/MICE_imputation.py,sha256=3CN_Z5NnQnr9BQOBcccIV13BcV-zRSvWUpYXoMZpPt8,10142
- ml_tools/VIF_factor.py,sha256=LQWr1P8WYij07FX_3RZC6Rr22bfAMnrt0Lhvi7SbBpY,9846
- ml_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- ml_tools/data_exploration.py,sha256=iRMyn-H0ffjhLkL-B5zKSb1tlyT4bKm0H4vE_GMaXP0,19903
- ml_tools/datasetmaster.py,sha256=EFUEX-tqq94Ak1rXXYR-XaX85olrxvF2EuytdzUK7y0,29131
- ml_tools/ensemble_learning.py,sha256=5CCd8w0j-uDkf7ToN2ENT_KdZbB8ZQUFYlrKN-OHUxA,37533
- ml_tools/handle_excel.py,sha256=ZJui5__0rc2T8UGHTheqZGhKmdVZ7Q2I54IoYCjAqJw,12612
- ml_tools/logger.py,sha256=NOtL3YSuffAGmpTpXjY-uJjqFLdRG_jpL7MDyloBw9c,4712
- ml_tools/particle_swarm_optimization.py,sha256=g_KwPQL77HuVwceABP17RsF__qLmNAp2YVsXOxmFEOM,20034
- ml_tools/pytorch_models.py,sha256=bpWZsrSwCvHJQkR6UfoPpElsMv9AvmiNErNHC8NYB_I,10132
- ml_tools/trainer.py,sha256=WAZ4EdrZuTOAnGXRWV3XcLNce4s7EKGf2-qchLC08Ik,15702
- ml_tools/utilities.py,sha256=r7uEmo38Imly56BP3-Jv6dFJvLsbGipeBlkiZx2fcNQ,10189
- ml_tools/vision_helpers.py,sha256=idQ-Ugp1IdsvwXiYyhYa9G3rTRTm37YRpkQDLEpANHM,7701
- dragon_ml_toolbox-1.4.3.dist-info/METADATA,sha256=l0uOaYlimIH_YCT89C2mOkHaiKDEtq9XxhHpbcMCppU,2516
- dragon_ml_toolbox-1.4.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- dragon_ml_toolbox-1.4.3.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
- dragon_ml_toolbox-1.4.3.dist-info/RECORD,,