dragon-ml-toolbox 8.2.0__py3-none-any.whl → 9.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of dragon-ml-toolbox might be problematic.
- {dragon_ml_toolbox-8.2.0.dist-info → dragon_ml_toolbox-9.0.0.dist-info}/METADATA +5 -1
- dragon_ml_toolbox-9.0.0.dist-info/RECORD +35 -0
- ml_tools/ETL_engineering.py +177 -79
- ml_tools/GUI_tools.py +5 -5
- ml_tools/MICE_imputation.py +12 -8
- ml_tools/ML_callbacks.py +6 -3
- ml_tools/ML_datasetmaster.py +37 -20
- ml_tools/ML_evaluation.py +4 -4
- ml_tools/ML_evaluation_multi.py +26 -17
- ml_tools/ML_inference.py +30 -23
- ml_tools/ML_models.py +14 -14
- ml_tools/ML_optimization.py +4 -3
- ml_tools/ML_scaler.py +7 -7
- ml_tools/ML_trainer.py +17 -15
- ml_tools/PSO_optimization.py +16 -8
- ml_tools/RNN_forecast.py +1 -1
- ml_tools/SQL.py +22 -13
- ml_tools/VIF_factor.py +7 -6
- ml_tools/_logger.py +105 -7
- ml_tools/custom_logger.py +12 -8
- ml_tools/data_exploration.py +20 -15
- ml_tools/ensemble_evaluation.py +10 -6
- ml_tools/ensemble_inference.py +18 -18
- ml_tools/ensemble_learning.py +8 -5
- ml_tools/handle_excel.py +15 -11
- ml_tools/optimization_tools.py +3 -4
- ml_tools/path_manager.py +21 -15
- ml_tools/utilities.py +35 -26
- dragon_ml_toolbox-8.2.0.dist-info/RECORD +0 -36
- ml_tools/_ML_optimization_multi.py +0 -231
- {dragon_ml_toolbox-8.2.0.dist-info → dragon_ml_toolbox-9.0.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-8.2.0.dist-info → dragon_ml_toolbox-9.0.0.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-8.2.0.dist-info → dragon_ml_toolbox-9.0.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-8.2.0.dist-info → dragon_ml_toolbox-9.0.0.dist-info}/top_level.txt +0 -0
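Nearly every hunk below makes the same change: modules stop embedding messages in raised exceptions and instead log through the shared `_LOGGER` before raising, which is why `ml_tools/_logger.py` grows by roughly a hundred lines in this release. As an orientation aid, here is a minimal sketch of what such a shared module-level logger could look like; it only assumes the standard `logging` module, and the helper name `_get_logger` and the format string are illustrative, not taken from the package.

```python
# Illustrative sketch only; the real ml_tools/_logger.py in 9.0.0 is larger and may differ.
import logging
import sys


def _get_logger(name: str = "dragon_ml_toolbox") -> logging.Logger:
    """Return a module-level logger with a single stderr handler."""
    logger = logging.getLogger(name)
    if not logger.handlers:  # avoid stacking handlers on repeated imports
        handler = logging.StreamHandler(sys.stderr)
        handler.setFormatter(logging.Formatter("[%(asctime)s] [%(name)s] %(levelname)s: %(message)s"))
        logger.addHandler(handler)
        logger.setLevel(logging.INFO)
    return logger


_LOGGER = _get_logger()
```

Each module presumably imports this object (something like `from ._logger import _LOGGER`) and then calls `_LOGGER.info(...)`, `_LOGGER.warning(...)`, `_LOGGER.error(...)` or `_LOGGER.exception(...)`, as the hunks below show.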
ml_tools/ensemble_evaluation.py
CHANGED
@@ -119,8 +119,8 @@ def evaluate_model_classification(
         heatmap_path = save_path / f"Classification_Report_{sanitized_target_name}.svg"
         plt.savefig(heatmap_path, format="svg", bbox_inches="tight")
         plt.close()
-    except Exception
-        _LOGGER.
+    except Exception:
+        _LOGGER.exception(f"Could not generate classification report heatmap for {target_name}:")

     # Create confusion matrix
     fig, ax = plt.subplots(figsize=figsize)
@@ -198,7 +198,8 @@ def plot_roc_curve(

     elif hasattr(probabilities_or_model, "predict_proba"):
         if input_features is None:
-
+            _LOGGER.error("input_features must be provided when using a classifier.")
+            raise ValueError()

         try:
             classes = probabilities_or_model.classes_ # type: ignore
@@ -209,7 +210,8 @@ def plot_roc_curve(
         y_score = probabilities_or_model.predict_proba(input_features)[:, positive_class_index] # type: ignore

     else:
-
+        _LOGGER.error("Unsupported type for 'probabilities_or_model'. Must be a NumPy array or a model with support for '.predict_proba()'.")
+        raise TypeError()

     # ROC and AUC
     fpr, tpr, _ = roc_curve(true_labels, y_score)
@@ -276,7 +278,8 @@ def plot_precision_recall_curve(

     elif hasattr(probabilities_or_model, "predict_proba"):
         if input_features is None:
-
+            _LOGGER.error("input_features must be provided when using a classifier.")
+            raise ValueError()
         try:
             classes = probabilities_or_model.classes_ # type: ignore
             positive_class_index = list(classes).index(1)
@@ -284,7 +287,8 @@ def plot_precision_recall_curve(
         positive_class_index = 1
         y_score = probabilities_or_model.predict_proba(input_features)[:, positive_class_index] # type: ignore
     else:
-
+        _LOGGER.error("Unsupported type for 'probabilities_or_model'. Must be a NumPy array or a model with support for '.predict_proba()'.")
+        raise TypeError()

     # Calculate PR curve and AP score
     precision, recall, _ = precision_recall_curve(true_labels, y_score)
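These hunks follow one refactor: validation failures now log through `_LOGGER.error(...)` and then raise an empty exception (the message lives in the log), and the heatmap `except` block switches to `_LOGGER.exception(...)` so the traceback is captured. A minimal sketch of the input-resolution branch as it reads after the change, assuming a `logging.Logger` named `_LOGGER`; the positive-class lookup is simplified to column 1 here, whereas the real functions resolve it from `classes_`.

```python
import logging
import numpy as np

_LOGGER = logging.getLogger("ml_tools")


def resolve_scores(probabilities_or_model, input_features=None) -> np.ndarray:
    """Return positive-class scores from either a probability array or a fitted classifier."""
    if isinstance(probabilities_or_model, np.ndarray):
        return probabilities_or_model
    elif hasattr(probabilities_or_model, "predict_proba"):
        if input_features is None:
            # new convention: log the message first, then raise an exception without one
            _LOGGER.error("input_features must be provided when using a classifier.")
            raise ValueError()
        return probabilities_or_model.predict_proba(input_features)[:, 1]
    else:
        _LOGGER.error("Unsupported type for 'probabilities_or_model'. Must be a NumPy array or a model with support for '.predict_proba()'.")
        raise TypeError()
```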
ml_tools/ensemble_inference.py
CHANGED
@@ -59,15 +59,15 @@ class InferenceHandler:
                     self._feature_names = feature_names_list
                 elif self._feature_names != feature_names_list:
                     # Add a warning if subsequent models have different feature names.
-                    _LOGGER.warning(f"
+                    _LOGGER.warning(f"Mismatched feature names in {fname}. Using feature order from the first model loaded.")

                 self.models[target_name] = model
                 if self.verbose:
-                    _LOGGER.info(f"
+                    _LOGGER.info(f"Loaded model for target: {target_name}")
+
+            except Exception:
+                _LOGGER.error(f"Failed to load or parse {fname}.")

-            except Exception as e:
-                _LOGGER.warning(f"⚠️ Failed to load or parse {fname}: {e}")
-
     @property
     def feature_names(self) -> List[str]:
         """
@@ -92,7 +92,8 @@ class InferenceHandler:
             features = features.reshape(1, -1)

         if features.shape[0] != 1:
-
+            _LOGGER.error("The 'predict()' method is for a single sample. Use 'predict_batch()' for multiple samples.")
+            raise ValueError()

         results: Dict[str, Any] = dict()
         for target_name, model in self.models.items():
@@ -106,7 +107,7 @@ class InferenceHandler:
                                             EnsembleKeys.CLASSIFICATION_PROBABILITIES: probabilities}

         if self.verbose:
-            _LOGGER.info("
+            _LOGGER.info("Inference process complete.")
         return results

     def predict_batch(self, features: np.ndarray) -> Dict[str, Any]:
@@ -122,7 +123,8 @@ class InferenceHandler:
             - For classification: The value is another dictionary {'labels': ..., 'probabilities': ...}.
         """
         if features.ndim != 2:
-
+            _LOGGER.error("Input for batch prediction must be a 2D array.")
+            raise ValueError()

         results: Dict[str, Any] = dict()
         for target_name, model in self.models.items():
@@ -134,7 +136,7 @@ class InferenceHandler:
                 results[target_name] = {"labels": labels, "probabilities": probabilities}

         if self.verbose:
-            _LOGGER.info("
+            _LOGGER.info("Inference process complete.")

         return results

@@ -174,11 +176,11 @@ def model_report(
         target = full_object[EnsembleKeys.TARGET]
         features = full_object[EnsembleKeys.FEATURES]
     except FileNotFoundError:
-        _LOGGER.error(f"
+        _LOGGER.error(f"Model file not found at '{model_p}'")
         raise
     except (KeyError, TypeError) as e:
         _LOGGER.error(
-            f"
+            f"The serialized object is missing required keys '{EnsembleKeys.MODEL}', '{EnsembleKeys.TARGET}', '{EnsembleKeys.FEATURES}'"
         )
         raise e

@@ -209,9 +211,9 @@ def model_report(
         with open(json_filepath, 'w') as f:
             json.dump(report_data, f, indent=4)
         if verbose:
-            _LOGGER.info(f"
+            _LOGGER.info(f"JSON report saved to: '{json_filepath}'")
     except PermissionError:
-        _LOGGER.
+        _LOGGER.exception(f"Permission denied to write JSON report at '{json_filepath}'.")

     # --- 5. Return the extracted data ---
     return report_data
@@ -233,15 +235,13 @@ def _deserialize_object(filepath: Union[str,Path], verbose: bool=True, raise_on_
     try:
         obj = joblib.load(true_filepath)
     except (IOError, OSError, EOFError, TypeError, ValueError) as e:
-
+        _LOGGER.error(f"Failed to deserialize object from '{true_filepath}'.")
         if raise_on_error:
-            raise
-        else:
-            print(message)
+            raise e
         return None
     else:
         if verbose:
-
+            _LOGGER.info(f"Loaded object of type '{type(obj)}'")
         return obj

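The `InferenceHandler` hunks apply the same pattern to shape validation: `predict()` now rejects anything other than a single sample, and `predict_batch()` rejects non-2D input, each logging before raising. A short sketch of just those guards, assuming `features` is a NumPy array and `_LOGGER` is the shared logger; the surrounding class and per-model loop are omitted.

```python
import logging
import numpy as np

_LOGGER = logging.getLogger("ml_tools")


def check_single_sample(features: np.ndarray) -> np.ndarray:
    """Guard used by predict(): accepts a 1D vector or a (1, n_features) array."""
    if features.ndim == 1:
        features = features.reshape(1, -1)
    if features.shape[0] != 1:
        _LOGGER.error("The 'predict()' method is for a single sample. Use 'predict_batch()' for multiple samples.")
        raise ValueError()
    return features


def check_batch(features: np.ndarray) -> np.ndarray:
    """Guard used by predict_batch(): requires a 2D (n_samples, n_features) array."""
    if features.ndim != 2:
        _LOGGER.error("Input for batch prediction must be a 2D array.")
        raise ValueError()
    return features
```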
ml_tools/ensemble_learning.py
CHANGED
@@ -339,7 +339,8 @@ def _resample(X_train: np.ndarray, y_train: pd.Series,
     elif strategy == 'ADASYN':
         resample_algorithm = ADASYN(random_state=random_state, n_neighbors=3)
     else:
-
+        _LOGGER.error(f"Invalid resampling strategy: {strategy}")
+        raise ValueError()

     X_res, y_res, *_ = resample_algorithm.fit_resample(X_train, y_train)
     return X_res, y_res
@@ -459,7 +460,8 @@ def train_test_pipeline(model, model_name: str, dataset_id: str, task: Literal["
         y_pred = evaluate_model_regression(model=trained_model, model_name=model_name, save_dir=local_save_directory,
                                            x_test_scaled=test_features, single_y_test=test_target, target_name=target_name)
     else:
-
+        _LOGGER.error(f"Unrecognized task '{task}' for model training,")
+        raise ValueError()
     if debug:
         _LOGGER.info(f"Predicted vector: {type(y_pred)} with shape: {y_pred.shape}")

@@ -487,13 +489,14 @@ def run_ensemble_pipeline(datasets_dir: Union[str,Path], save_dir: Union[str,Pat
     elif isinstance(model_object, ClassificationTreeModels):
         task = "classification"
         if handle_classification_imbalance is None:
-            _LOGGER.warning("
+            _LOGGER.warning("No method to handle classification class imbalance has been selected. Datasets are assumed to be balanced.")
         elif handle_classification_imbalance == "by_model":
             model_object.use_model_balance = True
         else:
             model_object.use_model_balance = False
     else:
-
+        _LOGGER.error(f"Unrecognized model {type(model_object)}")
+        raise TypeError()

     #Check paths
     datasets_path = make_fullpath(datasets_dir)
@@ -519,7 +522,7 @@ def run_ensemble_pipeline(datasets_dir: Union[str,Path], save_dir: Union[str,Pat
                             debug=debug, save_dir=save_path, save_model=save_model,
                             generate_learning_curves=generate_learning_curves)

-    _LOGGER.info("
+    _LOGGER.info("Training and evaluation complete.")


 def info():
ml_tools/handle_excel.py
CHANGED
@@ -37,7 +37,8 @@ def find_excel_files(
     input_path = make_fullpath(directory)

     if not input_path.is_dir():
-
+        _LOGGER.error(f"Directory not found: {input_path}")
+        raise NotADirectoryError()

     excel_files = [
         f for f in input_path.iterdir()
@@ -47,7 +48,8 @@ def find_excel_files(
     ]

     if not excel_files:
-
+        _LOGGER.error(f"No valid Excel files found in directory: {input_path}")
+        raise FileNotFoundError()

     return excel_files

@@ -99,7 +101,7 @@ def unmerge_and_split_excel(filepath: Union[str,Path]) -> None:

             total_output_files += 1

-    _LOGGER.info(f"
+    _LOGGER.info(f"Processed file: {file_path} into {total_output_files} output file(s).")
     return None


@@ -155,7 +157,7 @@ def unmerge_and_split_from_directory(input_dir: Union[str,Path], output_dir: Uni

             total_output_files += 1

-    _LOGGER.info(f"
+    _LOGGER.info(f"Processed {len(excel_files)} input Excel file(s) with a total of {total_output_files} output Excel file(s).")
     return None


@@ -199,13 +201,13 @@ def validate_excel_schema(
             invalid_files.append(file)

         except Exception as e:
-            _LOGGER.error(f"
+            _LOGGER.error(f"Error processing '{file}': {e}")
             invalid_files.append(file)

     valid_excel_number = len(excel_paths) - len(invalid_files)
     _LOGGER.info(f"{valid_excel_number} out of {len(excel_paths)} excel files conform to the schema.")
     if invalid_files:
-        _LOGGER.warning(f"
+        _LOGGER.warning(f"{len(invalid_files)} excel files are invalid:")
         for in_file in invalid_files:
             print(f"  - {in_file.name}")

@@ -252,7 +254,8 @@ def vertical_merge_transform_excel(
         if target_columns is not None:
             missing = [col for col in target_columns if col not in df.columns]
             if missing:
-
+                _LOGGER.error(f"Invalid columns in {file.name}: {missing}")
+                raise ValueError()
             df = df[target_columns]

         dataframes.append(df)
@@ -262,11 +265,12 @@ def vertical_merge_transform_excel(
     if rename_columns is not None:
         expected_len = len(target_columns if target_columns is not None else merged_df.columns)
         if len(rename_columns) != expected_len:
-
+            _LOGGER.error("Length of 'rename_columns' must match the selected columns")
+            raise ValueError()
         merged_df.columns = rename_columns

     merged_df.to_csv(csv_path, index=False, encoding='utf-8')
-    _LOGGER.info(f"
+    _LOGGER.info(f"Merged {len(dataframes)} excel files into '{csv_filename}'.")


 def horizontal_merge_transform_excel(
@@ -327,7 +331,7 @@ def horizontal_merge_transform_excel(
     duplicate_columns = merged_df.columns[merged_df.columns.duplicated()].tolist()

     if duplicate_columns:
-        _LOGGER.warning(f"
+        _LOGGER.warning(f"Duplicate columns: {duplicate_columns}")

     if skip_duplicates:
         merged_df = merged_df.loc[:, ~merged_df.columns.duplicated()]
@@ -347,7 +351,7 @@ def horizontal_merge_transform_excel(

     merged_df.to_csv(csv_path, index=False, encoding='utf-8')

-    _LOGGER.info(f"
+    _LOGGER.info(f"Merged {len(excel_files)} Excel files into '{csv_filename}'.")


 def info():
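In `find_excel_files` the two new guards replace bare raises with the log-then-raise pattern. A short sketch of the whole function under those changes, assuming pathlib-based handling and the shared `_LOGGER`; the extension filter in the middle is elided in the diff, so the version shown here is an assumption.

```python
import logging
from pathlib import Path
from typing import List, Union

_LOGGER = logging.getLogger("ml_tools")


def find_excel_files(directory: Union[str, Path]) -> List[Path]:
    """Return all Excel files in a directory, logging and raising on failure."""
    input_path = Path(directory).resolve()

    if not input_path.is_dir():
        _LOGGER.error(f"Directory not found: {input_path}")
        raise NotADirectoryError()

    # Filter criteria are assumed; the original list comprehension is elided in the hunk.
    excel_files = [
        f for f in input_path.iterdir()
        if f.suffix.lower() in (".xlsx", ".xls") and not f.name.startswith("~")
    ]

    if not excel_files:
        _LOGGER.error(f"No valid Excel files found in directory: {input_path}")
        raise FileNotFoundError()

    return excel_files
```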
ml_tools/optimization_tools.py
CHANGED
@@ -61,7 +61,7 @@ def plot_optimal_feature_distributions(results_dir: Union[str, Path]):

     long_df = pd.concat(data_to_plot, ignore_index=True)
     features = long_df['feature'].unique()
-    _LOGGER.info(f"
+    _LOGGER.info(f"Found data for {len(features)} features across {len(long_df['target'].unique())} targets. Generating plots...")

     # --- Plotting Loop ---
     for feature_name in features:
@@ -105,7 +105,7 @@ def plot_optimal_feature_distributions(results_dir: Union[str, Path]):
         plt.savefig(plot_filename, bbox_inches='tight')
         plt.close()

-    _LOGGER.info(f"
+    _LOGGER.info(f"All plots saved successfully to: '{output_path}'")


 def _save_result(
@@ -129,8 +129,7 @@ def _save_result(
     if db_manager and db_table_name:
         db_manager.insert_row(db_table_name, result_dict)
     else:
-        _LOGGER.warning("
-
+        _LOGGER.warning("SQLite saving requested but db_manager or table_name not provided.")


 def info():
ml_tools/path_manager.py
CHANGED
@@ -88,7 +88,7 @@ class PathManager:
         try:
             return self._paths[key]
         except KeyError:
-            _LOGGER.error(f"
+            _LOGGER.error(f"Path key '{key}' not found.")
             raise

     def update(self, new_paths: Dict[str, Union[str, Path]], overwrite: bool = False) -> None:
@@ -106,9 +106,8 @@ class PathManager:
         if not overwrite:
             for key in new_paths:
                 if key in self._paths:
-
-
-                    )
+                    _LOGGER.error(f"Path key '{key}' already exists in the manager. To replace it, call update() with overwrite=True.")
+                    raise KeyError

         # Resolve any string paths to Path objects before storing
         resolved_new_paths = {k: Path(v) for k, v in new_paths.items()}
@@ -136,7 +135,7 @@ class PathManager:
                 if key in self._paths:
                     path_items.append((key, self._paths[key]))
                 elif verbose:
-                    _LOGGER.warning(f"
+                    _LOGGER.warning(f"Key '{key}' not found in PathManager, skipping.")
         else:
             path_items = self._paths.items()

@@ -153,7 +152,7 @@ class PathManager:

             if self._is_bundled and is_internal_path:
                 if verbose:
-                    _LOGGER.warning(f"
+                    _LOGGER.warning(f"Skipping internal directory '{key}' in bundled app (read-only).")
                 continue
             # -------------------------

@@ -261,7 +260,8 @@ def make_fullpath(
         resolved = path.resolve(strict=True)
     except FileNotFoundError:
         if not make:
-
+            _LOGGER.error(f"Path does not exist: '{path}'.")
+            raise FileNotFoundError()

     try:
         if is_file:
@@ -271,14 +271,17 @@ def make_fullpath(
         else:
             path.mkdir(parents=True, exist_ok=True)
         resolved = path.resolve(strict=True)
-    except Exception
-
+    except Exception:
+        _LOGGER.exception(f"Failed to create {'file' if is_file else 'directory'} '{path}'.")
+        raise IOError()

     if enforce == "file" and not resolved.is_file():
-
+        _LOGGER.error(f"Path was enforced as a file, but it is not: '{resolved}'")
+        raise TypeError()

     if enforce == "directory" and not resolved.is_dir():
-
+        _LOGGER.error(f"Path was enforced as a directory, but it is not: '{resolved}'")
+        raise TypeError()

     if verbose:
         if resolved.is_file():
@@ -315,7 +318,8 @@ def sanitize_filename(filename: str) -> str:

     # Check for empty string after sanitization
     if not sanitized:
-
+        _LOGGER.error("The sanitized filename is empty. The original input may have contained only invalid characters.")
+        raise ValueError()

     return sanitized

@@ -334,7 +338,8 @@ def list_csv_paths(directory: Union[str,Path], verbose: bool=True) -> dict[str,

     csv_paths = list(dir_path.glob("*.csv"))
     if not csv_paths:
-
+        _LOGGER.error(f"No CSV files found in directory: {dir_path.name}")
+        raise IOError()

     # make a dictionary of paths and names
     name_path_dict = {p.stem: p for p in csv_paths}
@@ -367,12 +372,13 @@ def list_files_by_extension(directory: Union[str,Path], extension: str, verbose:

     matched_paths = list(dir_path.glob(pattern))
     if not matched_paths:
-
+        _LOGGER.error(f"No '.{normalized_ext}' files found in directory: {dir_path}.")
+        raise IOError()

     name_path_dict = {p.stem: p for p in matched_paths}

     if verbose:
-        _LOGGER.info(f"
+        _LOGGER.info(f"📂 '{normalized_ext.upper()}' files found:")
         for name in name_path_dict:
             print(f"\t{name}")

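The `make_fullpath` hunks add logging to every failure path: missing paths, failed creation (now `_LOGGER.exception` plus `IOError`), and the two `enforce` checks. A sketch of how those pieces fit together, inferred from the context lines above; the function signature, the `make` handling, and the `is_file` derivation are assumptions, not the package's actual implementation.

```python
import logging
from pathlib import Path
from typing import Literal, Optional, Union

_LOGGER = logging.getLogger("ml_tools")


def make_fullpath(
    input_path: Union[str, Path],
    make: bool = False,
    enforce: Optional[Literal["file", "directory"]] = None,
) -> Path:
    """Resolve a path, optionally creating it, and log before every raise."""
    path = Path(input_path).expanduser()
    try:
        resolved = path.resolve(strict=True)
    except FileNotFoundError:
        if not make:
            _LOGGER.error(f"Path does not exist: '{path}'.")
            raise FileNotFoundError()
        try:
            is_file = enforce == "file"  # assumed heuristic for what to create
            if is_file:
                path.parent.mkdir(parents=True, exist_ok=True)
                path.touch()
            else:
                path.mkdir(parents=True, exist_ok=True)
            resolved = path.resolve(strict=True)
        except Exception:
            _LOGGER.exception(f"Failed to create {'file' if is_file else 'directory'} '{path}'.")
            raise IOError()

    if enforce == "file" and not resolved.is_file():
        _LOGGER.error(f"Path was enforced as a file, but it is not: '{resolved}'")
        raise TypeError()
    if enforce == "directory" and not resolved.is_dir():
        _LOGGER.error(f"Path was enforced as a directory, but it is not: '{resolved}'")
        raise TypeError()
    return resolved
```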
ml_tools/utilities.py
CHANGED
@@ -76,11 +76,13 @@ def load_dataframe(
         df = pl.read_csv(path, infer_schema_length=1000)

     else:
-
+        _LOGGER.error(f"Invalid kind '{kind}'. Must be one of 'pandas' or 'polars'.")
+        raise ValueError()

     # This check works for both pandas and polars DataFrames
     if df.shape[0] == 0:
-
+        _LOGGER.error(f"DataFrame '{df_name}' loaded from '{path}' is empty.")
+        raise ValueError()

     if verbose:
         _LOGGER.info(f"💾 Loaded {kind.upper()} dataset: '{df_name}' with shape: {df.shape}")
@@ -162,13 +164,14 @@ def merge_dataframes(
         merged_df = pd.concat(dfs, axis=0)

     else:
-
+        _LOGGER.error(f"Invalid merge direction: {direction}")
+        raise ValueError()

     if reset_index:
         merged_df = merged_df.reset_index(drop=True)

     if verbose:
-        _LOGGER.info(f"
+        _LOGGER.info(f"Merged DataFrame shape: {merged_df.shape}")

     return merged_df

@@ -187,7 +190,7 @@ def save_dataframe(df: Union[pd.DataFrame, pl.DataFrame], save_dir: Union[str,Pa
     """
     # This check works for both pandas and polars
     if df.shape[0] == 0:
-        _LOGGER.warning(f"
+        _LOGGER.warning(f"Attempting to save an empty DataFrame: '{filename}'. Process Skipped.")
         return

     # Create the directory if it doesn't exist
@@ -207,9 +210,10 @@ def save_dataframe(df: Union[pd.DataFrame, pl.DataFrame], save_dir: Union[str,Pa
         df.write_csv(output_path) # Polars defaults to utf8 and no index
     else:
         # This error handles cases where an unsupported type is passed
-
+        _LOGGER.error(f"Unsupported DataFrame type: {type(df)}. Must be pandas or polars.")
+        raise TypeError()

-    _LOGGER.info(f"
+    _LOGGER.info(f"Saved dataset: '{filename}' with shape: {df.shape}")


 def normalize_mixed_list(data: list, threshold: int = 2) -> list[float]:
@@ -243,7 +247,8 @@ def normalize_mixed_list(data: list, threshold: int = 2) -> list[float]:

     # Raise for negative values
     if any(x < 0 for x in float_list):
-
+        _LOGGER.error("Negative values are not allowed in the input list.")
+        raise ValueError()

     # Step 2: Compute log10 of non-zero values
     nonzero = [x for x in float_list if x > 0]
@@ -302,14 +307,16 @@ def threshold_binary_values(
     elif isinstance(input_array, (list, tuple)):
         array = np.array(input_array)
     else:
-
+        _LOGGER.error("Unsupported input type")
+        raise TypeError()

     array = array.flatten()
     total = array.shape[0]

     bin_count = total if binary_values is None else binary_values
     if not (0 <= bin_count <= total):
-
+        _LOGGER.error("'binary_values' must be between 0 and the total number of elements")
+        raise ValueError()

     if bin_count == 0:
         result = array
@@ -349,9 +356,15 @@ def threshold_binary_values_batch(
     np.ndarray
         Thresholded array, same shape as input.
     """
-
+    if input_array.ndim != 2:
+        _LOGGER.error(f"Expected 2D array, got {input_array.ndim}D array.")
+        raise AssertionError()
+
     batch_size, total_features = input_array.shape
-
+
+    if not (0 <= binary_values <= total_features):
+        _LOGGER.error("'binary_values' out of valid range.")
+        raise AssertionError()

     if binary_values == 0:
         return input_array.copy()
@@ -380,15 +393,13 @@ def serialize_object(obj: Any, save_dir: Union[str,Path], filename: str, verbose
         full_path = save_path / sanitized_name
         joblib.dump(obj, full_path)
     except (IOError, OSError, TypeError, TerminatedWorkerError) as e:
-
+        _LOGGER.error(f"Failed to serialize object of type '{type(obj)}'.")
         if raise_on_error:
-            raise
-        else:
-            _LOGGER.warning(message)
+            raise e
         return None
     else:
         if verbose:
-            _LOGGER.info(f"
+            _LOGGER.info(f"Object of type '{type(obj)}' saved to '{full_path}'")
         return None


@@ -407,15 +418,13 @@ def deserialize_object(filepath: Union[str,Path], verbose: bool=True, raise_on_e
     try:
         obj = joblib.load(true_filepath)
     except (IOError, OSError, EOFError, TypeError, ValueError) as e:
-
+        _LOGGER.error(f"Failed to deserialize object from '{true_filepath}'.")
         if raise_on_error:
-            raise
-        else:
-            _LOGGER.warning(message)
+            raise e
         return None
     else:
         if verbose:
-            _LOGGER.info(f"
+            _LOGGER.info(f"Loaded object of type '{type(obj)}'.")
         return obj


@@ -486,7 +495,8 @@ def train_dataset_orchestrator(list_of_dirs: list[Union[str,Path]],
     for dir in list_of_dirs:
         dir_path = make_fullpath(dir)
         if not dir_path.is_dir():
-
+            _LOGGER.error(f"'{dir}' is not a directory.")
+            raise IOError()
         all_dir_paths.append(dir_path)

     # main loop
@@ -502,10 +512,10 @@ def train_dataset_orchestrator(list_of_dirs: list[Union[str,Path]],
             save_dataframe(df=df, save_dir=save_dir, filename=filename)
             total_saved += 1
         except Exception as e:
-            _LOGGER.
+            _LOGGER.error(f"Failed to process file '{df_path}'. Reason: {e}")
             continue

-    _LOGGER.info(f"
+    _LOGGER.info(f"{total_saved} single-target datasets were created.")


 def train_dataset_yielder(
@@ -530,6 +540,5 @@ def train_dataset_yielder(
         yield (df_features, df_target, feature_names, target_col)


-
 def info():
     _script_info(__all__)
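The `serialize_object`/`deserialize_object` hunks drop the old fallback branch (which referenced a `message` variable) in favour of logging the failure and re-raising when `raise_on_error` is set, or returning None otherwise. A compact sketch of the deserialization side as it reads after the change, assuming joblib and the shared logger; the signature is simplified.

```python
import logging
from pathlib import Path
from typing import Any, Optional, Union

import joblib

_LOGGER = logging.getLogger("ml_tools")


def deserialize_object(filepath: Union[str, Path], verbose: bool = True, raise_on_error: bool = True) -> Optional[Any]:
    """Load a joblib-serialized object, logging before re-raising or returning None."""
    true_filepath = Path(filepath)
    try:
        obj = joblib.load(true_filepath)
    except (IOError, OSError, EOFError, TypeError, ValueError) as e:
        _LOGGER.error(f"Failed to deserialize object from '{true_filepath}'.")
        if raise_on_error:
            raise e
        return None
    else:
        if verbose:
            _LOGGER.info(f"Loaded object of type '{type(obj)}'.")
        return obj
```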
dragon_ml_toolbox-8.2.0.dist-info/RECORD
DELETED
@@ -1,36 +0,0 @@
-dragon_ml_toolbox-8.2.0.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
-dragon_ml_toolbox-8.2.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=lY4_rJPnLnMu7YBQaY-_iz1JRDcLdQzNCyeLAF1glJY,1837
-ml_tools/ETL_engineering.py,sha256=69YGK4fN5ouRBknTvU4uZ8KLQGT-hPrvwymH-IygEnk,40911
-ml_tools/GUI_tools.py,sha256=n4ZZ5kEjwK5rkOCFJE41HeLFfjhpJVLUSzk9Kd9Kr_0,45410
-ml_tools/MICE_imputation.py,sha256=oFHg-OytOzPYTzBR_wIRHhP71cMn3aupDeT59ABsXlQ,11576
-ml_tools/ML_callbacks.py,sha256=noedVMmHZ72Odbg28zqx5wkhhvX2v-jXicKE_NCAiqU,13838
-ml_tools/ML_datasetmaster.py,sha256=tN-GBPEwXRWFBT8r8K0v9b3Bd77DhqSH5FkjDP6BHTw,28847
-ml_tools/ML_evaluation.py,sha256=BER5dOvSTySNzO92gm8tIpqJ5vT-s0iHMmaoly1uUH8,16018
-ml_tools/ML_evaluation_multi.py,sha256=uVtKGYWgOLv34Xj_jz6E_HAYzNb0HwRbMwA8oFZWpUk,12395
-ml_tools/ML_inference.py,sha256=hwtAdyDCE1xtqLgJgyOTAPck0eTmkOCJK1cM_IJSdck,22824
-ml_tools/ML_models.py,sha256=xZiSFh7S6eitl-VjjvNpsikojDvurK8n_ueLEh6_5pM,27979
-ml_tools/ML_optimization.py,sha256=GX-qZ2mCI3gWRCTP5w7lXrZpfGle3J_mE0O68seIoio,13475
-ml_tools/ML_scaler.py,sha256=pGkp1nUpeuoBvbq5hUkieQdxex6kNef1mEbeS_HUCJs,7471
-ml_tools/ML_trainer.py,sha256=6JSmEQaCPSo-S_5plNBTPw-SYgzZpyMNwiqpShJf7qU,23726
-ml_tools/PSO_optimization.py,sha256=9Y074d-B5h4Wvp9YPiy6KAeXM-Yv6Il3gWalKvOLVgo,22705
-ml_tools/RNN_forecast.py,sha256=2CyjBLSYYc3xLHxwLXUmP5Qv8AmV1OB_EndETNX1IBk,1956
-ml_tools/SQL.py,sha256=bkSTmMV4CtEqa67hApYWaRxTqwAlKIc5_b28P1bnDwg,10475
-ml_tools/VIF_factor.py,sha256=2nUMupfUoogf8o6ghoFZk_OwWhFXU0R3C9Gj0HOlI14,10415
-ml_tools/_ML_optimization_multi.py,sha256=DrNG3Vf1uUw-3CpYfXREgSGuR4dTpLWY1F3R9j-PYqQ,9816
-ml_tools/__init__.py,sha256=q0y9faQ6e17XCQ7eUiCZ1FJ4Bg5EQqLjZ9f_l5REUUY,41
-ml_tools/_logger.py,sha256=TpgYguxO-CWYqqgLW0tqFjtwZ58PE_W2OCfWNGZr0n0,1175
-ml_tools/_script_info.py,sha256=21r83LV3RubsNZ_RTEUON6RbDf7Mh4_udweNcvdF_Fk,212
-ml_tools/custom_logger.py,sha256=nyLRxaRxkqYOFdSjI0X2BWXB8C2IU18QfmqIFKqSedI,5820
-ml_tools/data_exploration.py,sha256=RuMHWagXrSQi1MzAMlYeBeVg7UxhVvEq8gJ9bIam2BM,27103
-ml_tools/ensemble_evaluation.py,sha256=wnqoTPg4WYWf2A8z5XT0eSlW4snEuLCXQVj88sZKzQ4,24683
-ml_tools/ensemble_inference.py,sha256=rtU7eUaQne615n2g7IHZCJI-OvrBCcjxbTkEIvtCGFQ,9414
-ml_tools/ensemble_learning.py,sha256=dAyFgSTyvxJWjc_enJ_8EUoWwiekBeoNyJNxVY-kcUU,21868
-ml_tools/handle_excel.py,sha256=J9iwIqMZemoxK49J5osSwp9Ge0h9YTKyYGbOm53hcno,13007
-ml_tools/keys.py,sha256=HtPG8-MWh89C32A7eIlfuuA-DLwkxGkoDfwR2TGN9CQ,1074
-ml_tools/optimization_tools.py,sha256=EL5tgNFwRo-82pbRE1CFVy9noNhULD7wprWuKadPheg,5090
-ml_tools/path_manager.py,sha256=Z8e7w3MPqQaN8xmTnKuXZS6CIW59BFwwqGhGc00sdp4,13692
-ml_tools/utilities.py,sha256=LqXXTovaHbA5AOKRk6Ru6DgAPAM0wPfYU70kUjYBryo,19231
-dragon_ml_toolbox-8.2.0.dist-info/METADATA,sha256=C1rjTnTNSj6VI2khy7Xl1VjQ__MP6-b43x9RIQCHY3E,6778
-dragon_ml_toolbox-8.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-dragon_ml_toolbox-8.2.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
-dragon_ml_toolbox-8.2.0.dist-info/RECORD,,