dragon-ml-toolbox 8.1.0__py3-none-any.whl → 9.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of dragon-ml-toolbox might be problematic. See the registry page for details.
- {dragon_ml_toolbox-8.1.0.dist-info → dragon_ml_toolbox-9.0.0.dist-info}/METADATA +5 -1
- dragon_ml_toolbox-9.0.0.dist-info/RECORD +35 -0
- ml_tools/ETL_engineering.py +216 -81
- ml_tools/GUI_tools.py +5 -5
- ml_tools/MICE_imputation.py +12 -8
- ml_tools/ML_callbacks.py +6 -3
- ml_tools/ML_datasetmaster.py +37 -20
- ml_tools/ML_evaluation.py +4 -4
- ml_tools/ML_evaluation_multi.py +26 -17
- ml_tools/ML_inference.py +30 -23
- ml_tools/ML_models.py +14 -14
- ml_tools/ML_optimization.py +4 -3
- ml_tools/ML_scaler.py +7 -7
- ml_tools/ML_trainer.py +17 -15
- ml_tools/PSO_optimization.py +16 -8
- ml_tools/RNN_forecast.py +1 -1
- ml_tools/SQL.py +22 -13
- ml_tools/VIF_factor.py +7 -6
- ml_tools/_logger.py +105 -7
- ml_tools/custom_logger.py +12 -8
- ml_tools/data_exploration.py +20 -15
- ml_tools/ensemble_evaluation.py +10 -6
- ml_tools/ensemble_inference.py +18 -18
- ml_tools/ensemble_learning.py +8 -5
- ml_tools/handle_excel.py +15 -11
- ml_tools/optimization_tools.py +3 -4
- ml_tools/path_manager.py +21 -15
- ml_tools/utilities.py +35 -26
- dragon_ml_toolbox-8.1.0.dist-info/RECORD +0 -36
- ml_tools/_ML_optimization_multi.py +0 -231
- {dragon_ml_toolbox-8.1.0.dist-info → dragon_ml_toolbox-9.0.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-8.1.0.dist-info → dragon_ml_toolbox-9.0.0.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-8.1.0.dist-info → dragon_ml_toolbox-9.0.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-8.1.0.dist-info → dragon_ml_toolbox-9.0.0.dist-info}/top_level.txt +0 -0
ml_tools/data_exploration.py
CHANGED
```diff
@@ -83,7 +83,8 @@ def drop_constant_columns(df: pd.DataFrame, verbose: bool = True) -> pd.DataFram
         A new DataFrame with the constant columns removed.
     """
     if not isinstance(df, pd.DataFrame):
-
+        _LOGGER.error("Input must be a pandas DataFrame.")
+        raise TypeError()
 
     original_columns = set(df.columns)
     cols_to_keep = []
@@ -136,7 +137,7 @@ def drop_rows_with_missing_data(df: pd.DataFrame, targets: Optional[list[str]],
             _LOGGER.info(f"🧹 Dropping {target_na.sum()} rows with all target columns missing.")
             df_clean = df_clean[~target_na]
         else:
-            _LOGGER.info("
+            _LOGGER.info("No rows found where all targets are missing.")
     else:
         valid_targets = []
 
@@ -149,9 +150,9 @@ def drop_rows_with_missing_data(df: pd.DataFrame, targets: Optional[list[str]],
             _LOGGER.info(f"🧹 Dropping {len(rows_to_drop)} rows with more than {threshold*100:.0f}% missing feature data.")
             df_clean = df_clean.drop(index=rows_to_drop)
         else:
-            _LOGGER.info(f"
+            _LOGGER.info(f"No rows exceed the {threshold*100:.0f}% missing feature data threshold.")
     else:
-        _LOGGER.warning("
+        _LOGGER.warning("No feature columns available to evaluate.")
 
     return df_clean
 
@@ -211,7 +212,7 @@ def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7, sho
     cols_to_drop = missing_fraction[missing_fraction > threshold].index
 
     if len(cols_to_drop) > 0:
-        _LOGGER.info(f"Dropping columns with more than {threshold*100:.0f}% missing data:")
+        _LOGGER.info(f"🧹 Dropping columns with more than {threshold*100:.0f}% missing data:")
         print(list(cols_to_drop))
 
     result_df = df.drop(columns=cols_to_drop)
@@ -339,7 +340,8 @@ def split_continuous_binary(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFram
         TypeError: If any column is not numeric.
     """
     if not all(np.issubdtype(dtype, np.number) for dtype in df.dtypes):
-
+        _LOGGER.error("All columns must be numeric (int or float).")
+        raise TypeError()
 
     binary_cols = []
     continuous_cols = []
@@ -390,7 +392,7 @@ def plot_correlation_heatmap(df: pd.DataFrame,
     """
     numeric_df = df.select_dtypes(include='number')
     if numeric_df.empty:
-        _LOGGER.warning("
+        _LOGGER.warning("No numeric columns found. Heatmap not generated.")
         return
 
     corr = numeric_df.corr(method=method)
@@ -558,11 +560,11 @@ def clip_outliers_single(
         None: if a problem with the dataframe column occurred.
     """
     if column not in df.columns:
-        _LOGGER.warning(f"
+        _LOGGER.warning(f"Column '{column}' not found in DataFrame.")
         return None
 
     if not pd.api.types.is_numeric_dtype(df[column]):
-        _LOGGER.warning(f"
+        _LOGGER.warning(f"Column '{column}' must be numeric.")
         return None
 
     new_df = df.copy(deep=True)
@@ -600,13 +602,16 @@ def clip_outliers_multi(
     for col, bounds in clip_dict.items():
         try:
             if col not in df.columns:
-
+                _LOGGER.error(f"Column '{col}' not found in DataFrame.")
+                raise ValueError()
 
             if not pd.api.types.is_numeric_dtype(df[col]):
-
+                _LOGGER.error(f"Column '{col}' is not numeric.")
+                raise TypeError()
 
             if not (isinstance(bounds, tuple) and len(bounds) == 2):
-
+                _LOGGER.error(f"Bounds for '{col}' must be a tuple of (min, max).")
+                raise ValueError()
 
             min_val, max_val = bounds
             new_df[col] = new_df[col].clip(lower=min_val, upper=max_val)
@@ -621,7 +626,7 @@ def clip_outliers_multi(
     _LOGGER.info(f"Clipped {clipped_columns} columns.")
 
     if skipped_columns:
-        _LOGGER.warning("
+        _LOGGER.warning("Skipped columns:")
        for col, msg in skipped_columns:
            print(f" - {col}: {msg}")
 
@@ -707,11 +712,11 @@ def standardize_percentages(
     for col in columns:
         # --- Robustness Checks ---
         if col not in df_copy.columns:
-            _LOGGER.warning(f"
+            _LOGGER.warning(f"Column '{col}' not found. Skipping.")
            continue
 
         if not is_numeric_dtype(df_copy[col]):
-            _LOGGER.warning(f"
+            _LOGGER.warning(f"Column '{col}' is not numeric. Skipping.")
            continue
 
         # --- Applying the Logic ---
```
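Every hunk above follows the same refactor: the error message is now emitted through the module-level `_LOGGER` and the exception is raised bare, instead of carrying the message itself. A minimal sketch of that pattern, assuming a plain `logging` logger in place of the package's own `ml_tools._logger` setup (the function body here is illustrative, not the package's actual implementation):

```python
import logging

import pandas as pd

# Stand-in for the package's module-level logger; ml_tools._logger configures its own handlers.
_LOGGER = logging.getLogger("ml_tools")


def drop_constant_columns_sketch(df: pd.DataFrame) -> pd.DataFrame:
    """Illustrates the log-then-raise validation style introduced in 9.0.0."""
    if not isinstance(df, pd.DataFrame):
        # The message goes to the logger; the exception itself is raised without arguments.
        _LOGGER.error("Input must be a pandas DataFrame.")
        raise TypeError()
    # Keep only columns that have more than one unique value (NaNs ignored).
    cols_to_keep = [col for col in df.columns if df[col].nunique(dropna=True) > 1]
    return df[cols_to_keep]
```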
ml_tools/ensemble_evaluation.py
CHANGED
```diff
@@ -119,8 +119,8 @@ def evaluate_model_classification(
         heatmap_path = save_path / f"Classification_Report_{sanitized_target_name}.svg"
         plt.savefig(heatmap_path, format="svg", bbox_inches="tight")
         plt.close()
-    except Exception
-        _LOGGER.
+    except Exception:
+        _LOGGER.exception(f"Could not generate classification report heatmap for {target_name}:")
 
     # Create confusion matrix
     fig, ax = plt.subplots(figsize=figsize)
@@ -198,7 +198,8 @@ def plot_roc_curve(
 
     elif hasattr(probabilities_or_model, "predict_proba"):
         if input_features is None:
-
+            _LOGGER.error("input_features must be provided when using a classifier.")
+            raise ValueError()
 
         try:
             classes = probabilities_or_model.classes_ # type: ignore
@@ -209,7 +210,8 @@ def plot_roc_curve(
         y_score = probabilities_or_model.predict_proba(input_features)[:, positive_class_index] # type: ignore
 
     else:
-
+        _LOGGER.error("Unsupported type for 'probabilities_or_model'. Must be a NumPy array or a model with support for '.predict_proba()'.")
+        raise TypeError()
 
     # ROC and AUC
     fpr, tpr, _ = roc_curve(true_labels, y_score)
@@ -276,7 +278,8 @@ def plot_precision_recall_curve(
 
     elif hasattr(probabilities_or_model, "predict_proba"):
         if input_features is None:
-
+            _LOGGER.error("input_features must be provided when using a classifier.")
+            raise ValueError()
         try:
             classes = probabilities_or_model.classes_ # type: ignore
             positive_class_index = list(classes).index(1)
@@ -284,7 +287,8 @@ def plot_precision_recall_curve(
             positive_class_index = 1
         y_score = probabilities_or_model.predict_proba(input_features)[:, positive_class_index] # type: ignore
     else:
-
+        _LOGGER.error("Unsupported type for 'probabilities_or_model'. Must be a NumPy array or a model with support for '.predict_proba()'.")
+        raise TypeError()
 
     # Calculate PR curve and AP score
     precision, recall, _ = precision_recall_curve(true_labels, y_score)
```
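The first hunk above also swaps plain error logging for `_LOGGER.exception(...)` inside an `except` block; `exception()` logs at ERROR level and appends the active traceback, which is what makes the plot-saving failure diagnosable without re-raising. A short sketch, again using a standard `logging` logger as a stand-in for the package's `_LOGGER`:

```python
import logging

_LOGGER = logging.getLogger("ml_tools")


def save_plot_sketch(render_and_save) -> None:
    """render_and_save is any callable that builds and writes a figure."""
    try:
        render_and_save()
    except Exception:
        # Unlike _LOGGER.error(), exception() records the full traceback with the message,
        # so the caller can continue (e.g. to the confusion matrix) and still debug later.
        _LOGGER.exception("Could not generate classification report heatmap:")
```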
ml_tools/ensemble_inference.py
CHANGED
```diff
@@ -59,15 +59,15 @@ class InferenceHandler:
                     self._feature_names = feature_names_list
                 elif self._feature_names != feature_names_list:
                     # Add a warning if subsequent models have different feature names.
-                    _LOGGER.warning(f"
+                    _LOGGER.warning(f"Mismatched feature names in {fname}. Using feature order from the first model loaded.")
 
                 self.models[target_name] = model
                 if self.verbose:
-                    _LOGGER.info(f"
+                    _LOGGER.info(f"Loaded model for target: {target_name}")
+
+            except Exception:
+                _LOGGER.error(f"Failed to load or parse {fname}.")
 
-            except Exception as e:
-                _LOGGER.warning(f"⚠️ Failed to load or parse {fname}: {e}")
-
     @property
     def feature_names(self) -> List[str]:
         """
@@ -92,7 +92,8 @@ class InferenceHandler:
             features = features.reshape(1, -1)
 
         if features.shape[0] != 1:
-
+            _LOGGER.error("The 'predict()' method is for a single sample. Use 'predict_batch()' for multiple samples.")
+            raise ValueError()
 
         results: Dict[str, Any] = dict()
         for target_name, model in self.models.items():
@@ -106,7 +107,7 @@ class InferenceHandler:
                                     EnsembleKeys.CLASSIFICATION_PROBABILITIES: probabilities}
 
         if self.verbose:
-            _LOGGER.info("
+            _LOGGER.info("Inference process complete.")
         return results
 
     def predict_batch(self, features: np.ndarray) -> Dict[str, Any]:
@@ -122,7 +123,8 @@ class InferenceHandler:
             - For classification: The value is another dictionary {'labels': ..., 'probabilities': ...}.
         """
         if features.ndim != 2:
-
+            _LOGGER.error("Input for batch prediction must be a 2D array.")
+            raise ValueError()
 
         results: Dict[str, Any] = dict()
         for target_name, model in self.models.items():
@@ -134,7 +136,7 @@ class InferenceHandler:
             results[target_name] = {"labels": labels, "probabilities": probabilities}
 
         if self.verbose:
-            _LOGGER.info("
+            _LOGGER.info("Inference process complete.")
 
         return results
 
@@ -174,11 +176,11 @@ def model_report(
         target = full_object[EnsembleKeys.TARGET]
         features = full_object[EnsembleKeys.FEATURES]
     except FileNotFoundError:
-        _LOGGER.error(f"
+        _LOGGER.error(f"Model file not found at '{model_p}'")
         raise
     except (KeyError, TypeError) as e:
         _LOGGER.error(
-            f"
+            f"The serialized object is missing required keys '{EnsembleKeys.MODEL}', '{EnsembleKeys.TARGET}', '{EnsembleKeys.FEATURES}'"
         )
         raise e
 
@@ -209,9 +211,9 @@ def model_report(
         with open(json_filepath, 'w') as f:
             json.dump(report_data, f, indent=4)
         if verbose:
-            _LOGGER.info(f"
+            _LOGGER.info(f"JSON report saved to: '{json_filepath}'")
     except PermissionError:
-        _LOGGER.
+        _LOGGER.exception(f"Permission denied to write JSON report at '{json_filepath}'.")
 
     # --- 5. Return the extracted data ---
     return report_data
@@ -233,15 +235,13 @@ def _deserialize_object(filepath: Union[str,Path], verbose: bool=True, raise_on_
     try:
         obj = joblib.load(true_filepath)
     except (IOError, OSError, EOFError, TypeError, ValueError) as e:
-
+        _LOGGER.error(f"Failed to deserialize object from '{true_filepath}'.")
         if raise_on_error:
-            raise
-        else:
-            print(message)
+            raise e
         return None
     else:
        if verbose:
-
+            _LOGGER.info(f"Loaded object of type '{type(obj)}'")
        return obj
 
 
```
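The shape checks above mean `predict()` accepts exactly one sample (a 1D vector is reshaped to `(1, n_features)`), while `predict_batch()` insists on a 2D array. A small NumPy sketch of preparing inputs that satisfy those checks; the handler construction itself is omitted because its signature is not shown in this diff:

```python
import numpy as np

# A single sample with four features: 1D is acceptable for predict(), which reshapes it.
single = np.array([0.1, 2.0, 3.5, 7.0])
assert single.reshape(1, -1).shape == (1, 4)

# Multiple samples must be stacked into a 2D array before calling predict_batch();
# anything else is logged as an error and raises ValueError.
batch = np.stack([single, single * 2.0])
assert batch.ndim == 2 and batch.shape == (2, 4)
```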
ml_tools/ensemble_learning.py
CHANGED
```diff
@@ -339,7 +339,8 @@ def _resample(X_train: np.ndarray, y_train: pd.Series,
     elif strategy == 'ADASYN':
         resample_algorithm = ADASYN(random_state=random_state, n_neighbors=3)
     else:
-
+        _LOGGER.error(f"Invalid resampling strategy: {strategy}")
+        raise ValueError()
 
     X_res, y_res, *_ = resample_algorithm.fit_resample(X_train, y_train)
     return X_res, y_res
@@ -459,7 +460,8 @@ def train_test_pipeline(model, model_name: str, dataset_id: str, task: Literal["
         y_pred = evaluate_model_regression(model=trained_model, model_name=model_name, save_dir=local_save_directory,
                                            x_test_scaled=test_features, single_y_test=test_target, target_name=target_name)
     else:
-
+        _LOGGER.error(f"Unrecognized task '{task}' for model training,")
+        raise ValueError()
     if debug:
         _LOGGER.info(f"Predicted vector: {type(y_pred)} with shape: {y_pred.shape}")
 
@@ -487,13 +489,14 @@ def run_ensemble_pipeline(datasets_dir: Union[str,Path], save_dir: Union[str,Pat
     elif isinstance(model_object, ClassificationTreeModels):
         task = "classification"
         if handle_classification_imbalance is None:
-            _LOGGER.warning("
+            _LOGGER.warning("No method to handle classification class imbalance has been selected. Datasets are assumed to be balanced.")
         elif handle_classification_imbalance == "by_model":
             model_object.use_model_balance = True
         else:
             model_object.use_model_balance = False
     else:
-
+        _LOGGER.error(f"Unrecognized model {type(model_object)}")
+        raise TypeError()
 
     #Check paths
     datasets_path = make_fullpath(datasets_dir)
@@ -519,7 +522,7 @@ def run_ensemble_pipeline(datasets_dir: Union[str,Path], save_dir: Union[str,Pat
                             debug=debug, save_dir=save_path, save_model=save_model,
                             generate_learning_curves=generate_learning_curves)
 
-    _LOGGER.info("
+    _LOGGER.info("Training and evaluation complete.")
 
 
 def info():
```
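`_resample` dispatches on a strategy string and, as of this release, logs before raising when the strategy is unknown. A hedged sketch of that dispatch with imbalanced-learn; only the ADASYN branch appears in the hunk, so the SMOTE branch and the default `random_state` are assumptions:

```python
import logging

import numpy as np
import pandas as pd
from imblearn.over_sampling import ADASYN, SMOTE

_LOGGER = logging.getLogger("ml_tools")


def resample_sketch(X_train: np.ndarray, y_train: pd.Series, strategy: str, random_state: int = 42):
    if strategy == "SMOTE":  # assumed branch, not shown in the diff
        algorithm = SMOTE(random_state=random_state)
    elif strategy == "ADASYN":  # matches the hunk above
        algorithm = ADASYN(random_state=random_state, n_neighbors=3)
    else:
        _LOGGER.error(f"Invalid resampling strategy: {strategy}")
        raise ValueError()
    X_res, y_res = algorithm.fit_resample(X_train, y_train)
    return X_res, y_res
```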
ml_tools/handle_excel.py
CHANGED
```diff
@@ -37,7 +37,8 @@ def find_excel_files(
     input_path = make_fullpath(directory)
 
     if not input_path.is_dir():
-
+        _LOGGER.error(f"Directory not found: {input_path}")
+        raise NotADirectoryError()
 
     excel_files = [
         f for f in input_path.iterdir()
@@ -47,7 +48,8 @@ def find_excel_files(
     ]
 
     if not excel_files:
-
+        _LOGGER.error(f"No valid Excel files found in directory: {input_path}")
+        raise FileNotFoundError()
 
     return excel_files
 
@@ -99,7 +101,7 @@ def unmerge_and_split_excel(filepath: Union[str,Path]) -> None:
 
             total_output_files += 1
 
-    _LOGGER.info(f"
+    _LOGGER.info(f"Processed file: {file_path} into {total_output_files} output file(s).")
     return None
 
 
@@ -155,7 +157,7 @@ def unmerge_and_split_from_directory(input_dir: Union[str,Path], output_dir: Uni
 
             total_output_files += 1
 
-    _LOGGER.info(f"
+    _LOGGER.info(f"Processed {len(excel_files)} input Excel file(s) with a total of {total_output_files} output Excel file(s).")
     return None
 
 
@@ -199,13 +201,13 @@ def validate_excel_schema(
                 invalid_files.append(file)
 
         except Exception as e:
-            _LOGGER.error(f"
+            _LOGGER.error(f"Error processing '{file}': {e}")
             invalid_files.append(file)
 
     valid_excel_number = len(excel_paths) - len(invalid_files)
     _LOGGER.info(f"{valid_excel_number} out of {len(excel_paths)} excel files conform to the schema.")
     if invalid_files:
-        _LOGGER.warning(f"
+        _LOGGER.warning(f"{len(invalid_files)} excel files are invalid:")
         for in_file in invalid_files:
             print(f" - {in_file.name}")
 
@@ -252,7 +254,8 @@ def vertical_merge_transform_excel(
         if target_columns is not None:
             missing = [col for col in target_columns if col not in df.columns]
             if missing:
-
+                _LOGGER.error(f"Invalid columns in {file.name}: {missing}")
+                raise ValueError()
             df = df[target_columns]
 
         dataframes.append(df)
@@ -262,11 +265,12 @@ def vertical_merge_transform_excel(
     if rename_columns is not None:
         expected_len = len(target_columns if target_columns is not None else merged_df.columns)
         if len(rename_columns) != expected_len:
-
+            _LOGGER.error("Length of 'rename_columns' must match the selected columns")
+            raise ValueError()
         merged_df.columns = rename_columns
 
     merged_df.to_csv(csv_path, index=False, encoding='utf-8')
-    _LOGGER.info(f"
+    _LOGGER.info(f"Merged {len(dataframes)} excel files into '{csv_filename}'.")
 
 
 def horizontal_merge_transform_excel(
@@ -327,7 +331,7 @@ def horizontal_merge_transform_excel(
     duplicate_columns = merged_df.columns[merged_df.columns.duplicated()].tolist()
 
     if duplicate_columns:
-        _LOGGER.warning(f"
+        _LOGGER.warning(f"Duplicate columns: {duplicate_columns}")
 
         if skip_duplicates:
             merged_df = merged_df.loc[:, ~merged_df.columns.duplicated()]
@@ -347,7 +351,7 @@ def horizontal_merge_transform_excel(
 
     merged_df.to_csv(csv_path, index=False, encoding='utf-8')
 
-    _LOGGER.info(f"
+    _LOGGER.info(f"Merged {len(excel_files)} Excel files into '{csv_filename}'.")
 
 
 def info():
```
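`vertical_merge_transform_excel` reads each workbook, optionally restricts the columns, concatenates everything, and writes a single CSV. A minimal pandas sketch of that merge step, assuming `.xlsx` inputs readable by `pd.read_excel` (file discovery, renaming, and the logging from the hunks above are left out):

```python
from pathlib import Path
from typing import Optional

import pandas as pd


def vertical_merge_sketch(excel_paths: list[Path], csv_path: Path,
                          target_columns: Optional[list[str]] = None) -> None:
    frames = []
    for path in excel_paths:
        df = pd.read_excel(path)  # requires an Excel engine such as openpyxl
        if target_columns is not None:
            df = df[target_columns]  # KeyError here plays the role of the logged ValueError above
        frames.append(df)
    merged = pd.concat(frames, ignore_index=True)
    merged.to_csv(csv_path, index=False, encoding="utf-8")
```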
ml_tools/optimization_tools.py
CHANGED
```diff
@@ -61,7 +61,7 @@ def plot_optimal_feature_distributions(results_dir: Union[str, Path]):
 
     long_df = pd.concat(data_to_plot, ignore_index=True)
     features = long_df['feature'].unique()
-    _LOGGER.info(f"
+    _LOGGER.info(f"Found data for {len(features)} features across {len(long_df['target'].unique())} targets. Generating plots...")
 
     # --- Plotting Loop ---
     for feature_name in features:
@@ -105,7 +105,7 @@ def plot_optimal_feature_distributions(results_dir: Union[str, Path]):
         plt.savefig(plot_filename, bbox_inches='tight')
         plt.close()
 
-    _LOGGER.info(f"
+    _LOGGER.info(f"All plots saved successfully to: '{output_path}'")
 
 
 def _save_result(
@@ -129,8 +129,7 @@ def _save_result(
     if db_manager and db_table_name:
         db_manager.insert_row(db_table_name, result_dict)
     else:
-        _LOGGER.warning("
-
+        _LOGGER.warning("SQLite saving requested but db_manager or table_name not provided.")
 
 
 def info():
```
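`_save_result` only persists to SQLite when both a database manager and a table name are supplied; otherwise it warns and moves on. A sketch of that branch; the `insert_row(table_name, row_dict)` call is taken from the hunk, while the manager object itself is treated here as an opaque dependency:

```python
import logging
from typing import Any, Optional

_LOGGER = logging.getLogger("ml_tools")


def save_result_sketch(result_dict: dict, db_manager: Optional[Any] = None,
                       db_table_name: Optional[str] = None) -> None:
    if db_manager and db_table_name:
        # insert_row(table_name, row_dict) as shown in the diff above.
        db_manager.insert_row(db_table_name, result_dict)
    else:
        _LOGGER.warning("SQLite saving requested but db_manager or table_name not provided.")
```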
ml_tools/path_manager.py
CHANGED
```diff
@@ -88,7 +88,7 @@ class PathManager:
         try:
             return self._paths[key]
         except KeyError:
-            _LOGGER.error(f"
+            _LOGGER.error(f"Path key '{key}' not found.")
             raise
 
     def update(self, new_paths: Dict[str, Union[str, Path]], overwrite: bool = False) -> None:
@@ -106,9 +106,8 @@ class PathManager:
         if not overwrite:
             for key in new_paths:
                 if key in self._paths:
-
-
-                    )
+                    _LOGGER.error(f"Path key '{key}' already exists in the manager. To replace it, call update() with overwrite=True.")
+                    raise KeyError
 
         # Resolve any string paths to Path objects before storing
         resolved_new_paths = {k: Path(v) for k, v in new_paths.items()}
@@ -136,7 +135,7 @@ class PathManager:
                 if key in self._paths:
                     path_items.append((key, self._paths[key]))
                 elif verbose:
-                    _LOGGER.warning(f"
+                    _LOGGER.warning(f"Key '{key}' not found in PathManager, skipping.")
         else:
             path_items = self._paths.items()
 
@@ -153,7 +152,7 @@ class PathManager:
 
             if self._is_bundled and is_internal_path:
                 if verbose:
-                    _LOGGER.warning(f"
+                    _LOGGER.warning(f"Skipping internal directory '{key}' in bundled app (read-only).")
                 continue
             # -------------------------
 
@@ -261,7 +260,8 @@ def make_fullpath(
         resolved = path.resolve(strict=True)
     except FileNotFoundError:
         if not make:
-
+            _LOGGER.error(f"Path does not exist: '{path}'.")
+            raise FileNotFoundError()
 
         try:
             if is_file:
@@ -271,14 +271,17 @@ def make_fullpath(
             else:
                 path.mkdir(parents=True, exist_ok=True)
             resolved = path.resolve(strict=True)
-        except Exception
-
+        except Exception:
+            _LOGGER.exception(f"Failed to create {'file' if is_file else 'directory'} '{path}'.")
+            raise IOError()
 
     if enforce == "file" and not resolved.is_file():
-
+        _LOGGER.error(f"Path was enforced as a file, but it is not: '{resolved}'")
+        raise TypeError()
 
     if enforce == "directory" and not resolved.is_dir():
-
+        _LOGGER.error(f"Path was enforced as a directory, but it is not: '{resolved}'")
+        raise TypeError()
 
     if verbose:
         if resolved.is_file():
@@ -315,7 +318,8 @@ def sanitize_filename(filename: str) -> str:
 
     # Check for empty string after sanitization
     if not sanitized:
-
+        _LOGGER.error("The sanitized filename is empty. The original input may have contained only invalid characters.")
+        raise ValueError()
 
     return sanitized
 
@@ -334,7 +338,8 @@ def list_csv_paths(directory: Union[str,Path], verbose: bool=True) -> dict[str,
 
     csv_paths = list(dir_path.glob("*.csv"))
     if not csv_paths:
-
+        _LOGGER.error(f"No CSV files found in directory: {dir_path.name}")
+        raise IOError()
 
     # make a dictionary of paths and names
     name_path_dict = {p.stem: p for p in csv_paths}
@@ -367,12 +372,13 @@ def list_files_by_extension(directory: Union[str,Path], extension: str, verbose:
 
     matched_paths = list(dir_path.glob(pattern))
     if not matched_paths:
-
+        _LOGGER.error(f"No '.{normalized_ext}' files found in directory: {dir_path}.")
+        raise IOError()
 
     name_path_dict = {p.stem: p for p in matched_paths}
 
     if verbose:
-        _LOGGER.info(f"
+        _LOGGER.info(f"📂 '{normalized_ext.upper()}' files found:")
         for name in name_path_dict:
             print(f"\t{name}")
 
```
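Taken together, the `make_fullpath` hunks describe a helper that resolves a path, optionally creates it, and then enforces whether the result must be a file or a directory. A simplified sketch of that contract; the parameter names follow the hunks, but the real signature and the file-creation branch are not fully visible in this diff:

```python
import logging
from pathlib import Path
from typing import Union

_LOGGER = logging.getLogger("ml_tools")


def make_fullpath_sketch(input_path: Union[str, Path], make: bool = False,
                         enforce: str = "directory") -> Path:
    path = Path(input_path).expanduser()
    try:
        resolved = path.resolve(strict=True)
    except FileNotFoundError:
        if not make:
            _LOGGER.error(f"Path does not exist: '{path}'.")
            raise FileNotFoundError()
        # Only the directory-creation branch is sketched here.
        path.mkdir(parents=True, exist_ok=True)
        resolved = path.resolve(strict=True)
    if enforce == "directory" and not resolved.is_dir():
        _LOGGER.error(f"Path was enforced as a directory, but it is not: '{resolved}'")
        raise TypeError()
    return resolved
```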