dragon-ml-toolbox 1.4.3__tar.gz → 1.4.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of dragon-ml-toolbox might be problematic.
- {dragon_ml_toolbox-1.4.3/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-1.4.4}/PKG-INFO +1 -1
- {dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.4/dragon_ml_toolbox.egg-info}/PKG-INFO +1 -1
- {dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.4}/ml_tools/MICE_imputation.py +2 -2
- {dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.4}/ml_tools/VIF_factor.py +9 -6
- {dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.4}/ml_tools/data_exploration.py +11 -5
- {dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.4}/ml_tools/ensemble_learning.py +4 -8
- {dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.4}/ml_tools/logger.py +2 -2
- {dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.4}/ml_tools/particle_swarm_optimization.py +3 -3
- {dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.4}/pyproject.toml +1 -1
- {dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.4}/LICENSE +0 -0
- {dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.4}/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.4}/README.md +0 -0
- {dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.4}/dragon_ml_toolbox.egg-info/SOURCES.txt +0 -0
- {dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.4}/dragon_ml_toolbox.egg-info/dependency_links.txt +0 -0
- {dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.4}/dragon_ml_toolbox.egg-info/requires.txt +0 -0
- {dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.4}/dragon_ml_toolbox.egg-info/top_level.txt +0 -0
- {dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.4}/ml_tools/__init__.py +0 -0
- {dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.4}/ml_tools/datasetmaster.py +0 -0
- {dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.4}/ml_tools/handle_excel.py +0 -0
- {dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.4}/ml_tools/pytorch_models.py +0 -0
- {dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.4}/ml_tools/trainer.py +0 -0
- {dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.4}/ml_tools/utilities.py +0 -0
- {dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.4}/ml_tools/vision_helpers.py +0 -0
- {dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.4}/setup.cfg +0 -0
ml_tools/MICE_imputation.py

@@ -36,9 +36,9 @@ def apply_mice(df: pd.DataFrame, df_name: str, resulting_datasets: int=1, iterat
         raise ValueError("No imputed datasets were generated. Check the MICE process.")
 
     if resulting_datasets == 1:
-        imputed_dataset_names = [f"{df_name}
+        imputed_dataset_names = [f"{df_name}_MICE"]
     else:
-        imputed_dataset_names = [f"{df_name}
+        imputed_dataset_names = [f"{df_name}_MICE_{i+1}" for i in range(resulting_datasets)]
 
     # Ensure indexes match
     for imputed_df, subname in zip(imputed_datasets, imputed_dataset_names):
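To make the new naming scheme concrete, here is a minimal sketch of the naming logic in isolation (the surrounding apply_mice plumbing is assumed; "housing" is an example dataset name, not package code):

# Minimal sketch of the 1.4.4 naming logic, isolated from apply_mice.
df_name = "housing"
resulting_datasets = 3

if resulting_datasets == 1:
    names = [f"{df_name}_MICE"]
else:
    names = [f"{df_name}_MICE_{i+1}" for i in range(resulting_datasets)]

print(names)  # ['housing_MICE_1', 'housing_MICE_2', 'housing_MICE_3']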
ml_tools/VIF_factor.py

@@ -26,6 +26,7 @@ def compute_vif(
     filename: Optional[str] = None,
     fontsize: int = 14,
     show_plot: bool = True,
+    verbose: bool = True
 ) -> pd.DataFrame:
     """
     Computes Variance Inflation Factors (VIF) for numeric columns in a DataFrame. Optionally, generates a bar plot of VIF values.
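With the new flag, callers can silence the column warnings entirely. A hypothetical call (the keyword names are the ones visible in these hunks; df is any DataFrame with numeric columns):

# Hypothetical call to compute_vif after 1.4.4; silences both plot and warnings.
from ml_tools.VIF_factor import compute_vif

vif_df = compute_vif(df=df, show_plot=False, verbose=False)  # returns a pd.DataFrame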
@@ -52,19 +53,20 @@ def compute_vif(
     if use_columns is None:
         sanitized_columns = df.select_dtypes(include='number').columns.tolist()
         missing_features = set(ground_truth_cols) - set(sanitized_columns)
-        if missing_features:
+        if missing_features and verbose:
             print(f"⚠️ These columns are not Numeric:\n{missing_features}")
     else:
         sanitized_columns = list()
         for feature in use_columns:
             if feature not in ground_truth_cols:
-
+                if verbose:
+                    print(f"⚠️ The provided column '{feature}' is not in the DataFrame.")
             else:
                 sanitized_columns.append(feature)
 
     if ignore_columns is not None and use_columns is None:
         missing_ignore = set(ignore_columns) - set(ground_truth_cols)
-        if missing_ignore:
+        if missing_ignore and verbose:
             print(f"⚠️ Warning: The following 'columns to ignore' are not in the Dataframe:\n{missing_ignore}")
         sanitized_columns = [f for f in sanitized_columns if f not in ignore_columns]
 
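For background, the VIF of feature i is 1 / (1 - R²) from regressing that feature on all the others, which is exactly what the standard statsmodels routine computes. A self-contained sketch of the underlying computation (not this package's exact implementation, which adds column sanitization, plotting, and the new verbose gate on top):

# Plain-statsmodels VIF computation for comparison with compute_vif.
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

df = pd.DataFrame({
    "x1": [1.0, 2.0, 3.0, 4.0, 5.0],
    "x2": [2.1, 3.9, 6.2, 8.1, 9.8],  # nearly collinear with x1
    "x3": [5.0, 3.0, 6.0, 2.0, 7.0],
})

X = add_constant(df)  # include an intercept so the VIFs are meaningful
vif = pd.DataFrame({
    "feature": df.columns,
    "VIF": [variance_inflation_factor(X.values, i + 1)  # +1 skips the constant column
            for i in range(len(df.columns))],
})
print(vif)  # x1 and x2 show very large VIFs (>= 10 is a common drop threshold)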
@@ -182,7 +184,7 @@ def compute_vif_multi(input_directory: str,
                       max_features_to_plot: int = 20,
                       fontsize: int = 14):
     """
-    Computes Variance Inflation Factors (VIF) for numeric columns in a directory with CSV files (loaded as pandas DataFrames).
+    Computes Variance Inflation Factors (VIF) for numeric columns in a directory with CSV files (loaded as pandas DataFrames). No plots or warnings will be displayed inline.
     Generates a bar plot of VIF values. Optionally drops columns with VIF >= 10 and saves as a new CSV file.
 
     Args:
@@ -210,10 +212,11 @@ def compute_vif_multi(input_directory: str,
             fontsize=fontsize,
             save_dir=output_plot_directory,
             filename=df_name,
-            show_plot=False
+            show_plot=False,
+            verbose=False)
 
         if output_dataset_directory is not None:
-            new_filename =
+            new_filename = df_name + '_VIF'
             result_df, dropped_cols = drop_vif_based(df=df, vif_df=vif_dataframe)
 
             if len(dropped_cols) > 0:
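A hypothetical batch invocation after this change; the directory paths are placeholders, and the parameter names are the ones visible in these hunks:

# Hypothetical usage of compute_vif_multi in 1.4.4. Plots go to disk,
# per-file warnings are suppressed (verbose=False is forwarded internally),
# and cleaned datasets are saved with the '_VIF' suffix built above.
from ml_tools.VIF_factor import compute_vif_multi

compute_vif_multi(
    input_directory="data/csv_in",
    output_plot_directory="plots/vif",
    output_dataset_directory="data/csv_out",
)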
ml_tools/data_exploration.py

@@ -153,7 +153,7 @@ def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7, sho
 
         result_df = df.drop(columns=cols_to_drop)
         if show_nulls_after:
-            show_null_columns(df=result_df)
+            print(show_null_columns(df=result_df))
 
         return result_df
     else:
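The likely rationale for this fix, inferred from the diff: show_null_columns evidently returns its summary rather than printing it, so the bare call produced no visible output in scripts. A toy illustration of the difference (null_summary is a stand-in, not the package's function):

# Toy stand-in for show_null_columns: it returns, rather than prints, a summary.
import pandas as pd

def null_summary(df: pd.DataFrame) -> pd.Series:
    return df.isnull().sum()

df = pd.DataFrame({"a": [1, None], "b": [2, 3]})
null_summary(df)         # in a plain script, this line displays nothing
print(null_summary(df))  # the 1.4.4 pattern: explicitly print the return value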
@@ -259,7 +259,7 @@ def plot_correlation_heatmap(df: pd.DataFrame, save_dir: Union[str, None] = None
         os.makedirs(save_dir, exist_ok=True)
         full_path = os.path.join(save_dir, plot_title + ".svg")
         plt.savefig(full_path, bbox_inches="tight", format='svg')
-        print(f"Saved correlation heatmap
+        print(f"Saved correlation heatmap: '{plot_title}.svg'")
 
     plt.show()
     plt.close()
@@ -521,7 +521,8 @@ def clip_outliers_multi(
 
 def distribute_datasets_by_target(
     df: pd.DataFrame,
-    target_columns: list[str]
+    target_columns: list[str],
+    verbose: bool = False
 ) -> Iterator[Tuple[str, pd.DataFrame]]:
     """
     Yields cleaned DataFrames for each target column, where rows with missing
@@ -533,6 +534,8 @@ def distribute_datasets_by_target(
         Preprocessed dataframe with all feature and target columns ready to train.
     target_columns : List[str]
         List of target column names to generate per-target DataFrames.
+    verbose: bool
+        Whether to print info for each yielded dataset.
 
     Yields
     ------
@@ -540,10 +543,13 @@ def distribute_datasets_by_target(
         * First element is the target column name.
         * Second element is the corresponding cleaned DataFrame.
     """
-
+    valid_targets = [col for col in df.columns if col in target_columns]
+    feature_columns = [col for col in df.columns if col not in valid_targets]
 
-    for target in
+    for target in valid_targets:
         subset = df[feature_columns + [target]].dropna(subset=[target])
+        if verbose:
+            print(f"Target: '{target}' - Dataframe shape: {subset.shape}")
         yield target, subset
 
 
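Because this hunk shows the full new body, the generator can be lifted out and exercised standalone; the following mirrors the 1.4.4 code, with a toy DataFrame added for demonstration:

# Standalone copy of the distribute_datasets_by_target logic from this hunk.
from typing import Iterator, Tuple
import numpy as np
import pandas as pd

def distribute_datasets_by_target(
    df: pd.DataFrame,
    target_columns: list[str],
    verbose: bool = False,
) -> Iterator[Tuple[str, pd.DataFrame]]:
    valid_targets = [col for col in df.columns if col in target_columns]
    feature_columns = [col for col in df.columns if col not in valid_targets]
    for target in valid_targets:
        # Keep every feature plus one target; drop rows where that target is NaN.
        subset = df[feature_columns + [target]].dropna(subset=[target])
        if verbose:
            print(f"Target: '{target}' - Dataframe shape: {subset.shape}")
        yield target, subset

df = pd.DataFrame({"f1": [1, 2, 3, 4],
                   "t1": [1.0, np.nan, 3.0, 4.0],
                   "t2": [np.nan, 2.0, np.nan, 4.0]})
for name, data in distribute_datasets_by_target(df, ["t1", "t2"], verbose=True):
    pass  # prints: Target: 't1' - Dataframe shape: (3, 2), then (2, 2) for 't2'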
ml_tools/ensemble_learning.py

@@ -157,9 +157,7 @@ class RegressionTreeModels:
         self.gamma = gamma
 
         # LightGBM specific
-
-        num_leaves = (2**max_depth) - 1
-        print(f"⚠️ Warning: 'num_leaves' should be set proportional to 'max_depth'. Value set as {num_leaves}.")
+        num_leaves = min(num_leaves, 2 ** (max_depth - 1))
         self.num_leaves = num_leaves
         self.min_data_in_leaf = min_data_in_leaf
 
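Context for this change: a LightGBM tree limited to max_depth can have at most 2**max_depth leaves, and the LightGBM docs advise keeping num_leaves below that ceiling to curb overfitting. The old code silently overwrote the value with (2**max_depth) - 1 and printed a warning; the new code instead caps a user-supplied num_leaves (evidently now a constructor argument) at 2**(max_depth - 1). Example arithmetic:

# Illustration of the 1.4.4 cap; the values are examples only.
max_depth = 6
requested_num_leaves = 100

# A depth-6 tree can have at most 2**6 = 64 leaves; the cap keeps the
# effective value conservatively at half of that ceiling.
effective = min(requested_num_leaves, 2 ** (max_depth - 1))
print(effective)  # 32 -> the request of 100 is clamped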
@@ -202,7 +200,7 @@ class RegressionTreeModels:
             verbose=-1,
             reg_alpha=self.L1,
             reg_lambda=self.L2,
-            boosting_type='
+            boosting_type='gbdt',
             num_leaves=self.num_leaves,
             min_data_in_leaf=self.min_data_in_leaf
         )
@@ -321,9 +319,7 @@ class ClassificationTreeModels:
         self.gamma = gamma
 
         # LightGBM specific
-
-        num_leaves = (2**max_depth) - 1
-        print(f"⚠️ Warning: 'num_leaves' should be set proportional to 'max_depth'. Value set as {num_leaves}.")
+        num_leaves = min(num_leaves, 2 ** (max_depth - 1))
         self.num_leaves = num_leaves
         self.min_data_in_leaf = min_data_in_leaf
 
@@ -370,7 +366,7 @@ class ClassificationTreeModels:
             verbose=-1,
             reg_alpha=self.L1,
             reg_lambda=self.L2,
-            boosting_type='
+            boosting_type='gbdt' if self.use_model_balance else 'goss',
             num_leaves=self.num_leaves,
             min_data_in_leaf=self.min_data_in_leaf,
             class_weight='balanced' if self.use_model_balance else None
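Both branch values are real LightGBM boosting strategies: 'gbdt' (the regressor above is now pinned to it) is standard gradient-boosted trees, and 'goss' is Gradient-based One-Side Sampling; pairing 'gbdt' with class_weight='balanced', and 'goss' with neither, is this package's own design choice. A sketch of constructing the estimator the way this hunk does, with placeholder hyperparameters:

# Sketch of the 1.4.4 classifier construction; hyperparameter values are placeholders.
# Note: recent LightGBM (>= 4.0) prefers data_sample_strategy='goss' over
# boosting_type='goss', so this mirrors the package as written, not current best practice.
from lightgbm import LGBMClassifier

use_model_balance = True
clf = LGBMClassifier(
    verbose=-1,
    reg_alpha=0.1,   # stands in for self.L1
    reg_lambda=0.1,  # stands in for self.L2
    boosting_type='gbdt' if use_model_balance else 'goss',
    num_leaves=31,
    min_data_in_leaf=20,  # LightGBM alias for min_child_samples
    class_weight='balanced' if use_model_balance else None,
)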
ml_tools/logger.py

@@ -55,7 +55,7 @@ def custom_logger(
     """
     try:
         os.makedirs(save_directory, exist_ok=True)
-        timestamp = datetime.now().strftime(r"%Y%m%d_%H%M")
+        timestamp = datetime.now().strftime(r"%Y%m%d_%H%M%S")
         log_name = sanitize_filename(log_name)
         base_path = os.path.join(save_directory, f"{log_name}_{timestamp}")
 
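The only change here is appending %S, so two logs created within the same minute no longer collide on the same base filename:

# Before vs. after the 1.4.4 timestamp change.
from datetime import datetime

old = datetime.now().strftime(r"%Y%m%d_%H%M")    # e.g. '20250101_1230'
new = datetime.now().strftime(r"%Y%m%d_%H%M%S")  # e.g. '20250101_123045'
print(old, new)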
@@ -80,7 +80,7 @@ def custom_logger(
         else:
             raise ValueError("Unsupported data type. Must be list, dict, DataFrame, str, or BaseException.")
 
-        print(f"Log saved to: {base_path}")
+        print(f"Log saved to: '{base_path}'")
 
     except Exception as e:
         print(f"Error in custom_logger: {e}")
ml_tools/particle_swarm_optimization.py

@@ -129,10 +129,10 @@ def run_pso(lower_boundaries: list[float],
             target_name: Union[str, None]=None,
             feature_names: Union[list[str], None]=None,
             swarm_size: int=200,
-            max_iterations: int=
+            max_iterations: int=1500,
             inequality_constrain_function=None,
-            post_hoc_analysis: Optional[int]=
-            workers: int=
+            post_hoc_analysis: Optional[int]=5,
+            workers: int=1) -> Tuple[Dict[str, float | list[float]], Dict[str, float | list[float]]]:
     """
     Executes Particle Swarm Optimization (PSO) to optimize a given objective function and saves the results as a CSV file.
 
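One side note on the new return annotation: Tuple[Dict[str, float | list[float]], ...] presumably reflects that with post_hoc_analysis set, each dict value is a list of values rather than a single float. It also uses PEP 604 union syntax inside a subscript that Python evaluates when the def statement runs, so it requires Python 3.10+ (or deferred annotations). A minimal check:

# PEP 604 unions inside evaluated annotations need Python 3.10+.
from typing import Dict, Tuple

ReturnType = Tuple[Dict[str, float | list[float]], Dict[str, float | list[float]]]
# On Python < 3.10 the line above raises TypeError at runtime; inside a
# function signature, `from __future__ import annotations` would defer
# evaluation, but that does not help a module-level alias like this one.
print(ReturnType)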