dragon-ml-toolbox 3.11.0__tar.gz → 3.12.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dragon_ml_toolbox-3.11.0/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-3.12.1}/PKG-INFO +1 -1
- {dragon_ml_toolbox-3.11.0 → dragon_ml_toolbox-3.12.1/dragon_ml_toolbox.egg-info}/PKG-INFO +1 -1
- {dragon_ml_toolbox-3.11.0 → dragon_ml_toolbox-3.12.1}/ml_tools/GUI_tools.py +147 -17
- {dragon_ml_toolbox-3.11.0 → dragon_ml_toolbox-3.12.1}/ml_tools/MICE_imputation.py +6 -6
- {dragon_ml_toolbox-3.11.0 → dragon_ml_toolbox-3.12.1}/ml_tools/PSO_optimization.py +2 -4
- {dragon_ml_toolbox-3.11.0 → dragon_ml_toolbox-3.12.1}/ml_tools/VIF_factor.py +15 -13
- {dragon_ml_toolbox-3.11.0 → dragon_ml_toolbox-3.12.1}/ml_tools/data_exploration.py +1 -1
- {dragon_ml_toolbox-3.11.0 → dragon_ml_toolbox-3.12.1}/ml_tools/handle_excel.py +9 -8
- {dragon_ml_toolbox-3.11.0 → dragon_ml_toolbox-3.12.1}/ml_tools/path_manager.py +1 -1
- {dragon_ml_toolbox-3.11.0 → dragon_ml_toolbox-3.12.1}/ml_tools/utilities.py +73 -41
- {dragon_ml_toolbox-3.11.0 → dragon_ml_toolbox-3.12.1}/pyproject.toml +1 -1
- {dragon_ml_toolbox-3.11.0 → dragon_ml_toolbox-3.12.1}/LICENSE +0 -0
- {dragon_ml_toolbox-3.11.0 → dragon_ml_toolbox-3.12.1}/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-3.11.0 → dragon_ml_toolbox-3.12.1}/README.md +0 -0
- {dragon_ml_toolbox-3.11.0 → dragon_ml_toolbox-3.12.1}/dragon_ml_toolbox.egg-info/SOURCES.txt +0 -0
- {dragon_ml_toolbox-3.11.0 → dragon_ml_toolbox-3.12.1}/dragon_ml_toolbox.egg-info/dependency_links.txt +0 -0
- {dragon_ml_toolbox-3.11.0 → dragon_ml_toolbox-3.12.1}/dragon_ml_toolbox.egg-info/requires.txt +0 -0
- {dragon_ml_toolbox-3.11.0 → dragon_ml_toolbox-3.12.1}/dragon_ml_toolbox.egg-info/top_level.txt +0 -0
- {dragon_ml_toolbox-3.11.0 → dragon_ml_toolbox-3.12.1}/ml_tools/ETL_engineering.py +0 -0
- {dragon_ml_toolbox-3.11.0 → dragon_ml_toolbox-3.12.1}/ml_tools/ML_callbacks.py +0 -0
- {dragon_ml_toolbox-3.11.0 → dragon_ml_toolbox-3.12.1}/ml_tools/ML_evaluation.py +0 -0
- {dragon_ml_toolbox-3.11.0 → dragon_ml_toolbox-3.12.1}/ml_tools/ML_trainer.py +0 -0
- {dragon_ml_toolbox-3.11.0 → dragon_ml_toolbox-3.12.1}/ml_tools/ML_tutorial.py +0 -0
- {dragon_ml_toolbox-3.11.0 → dragon_ml_toolbox-3.12.1}/ml_tools/RNN_forecast.py +0 -0
- {dragon_ml_toolbox-3.11.0 → dragon_ml_toolbox-3.12.1}/ml_tools/__init__.py +0 -0
- {dragon_ml_toolbox-3.11.0 → dragon_ml_toolbox-3.12.1}/ml_tools/_pytorch_models.py +0 -0
- {dragon_ml_toolbox-3.11.0 → dragon_ml_toolbox-3.12.1}/ml_tools/datasetmaster.py +0 -0
- {dragon_ml_toolbox-3.11.0 → dragon_ml_toolbox-3.12.1}/ml_tools/ensemble_learning.py +0 -0
- {dragon_ml_toolbox-3.11.0 → dragon_ml_toolbox-3.12.1}/ml_tools/keys.py +0 -0
- {dragon_ml_toolbox-3.11.0 → dragon_ml_toolbox-3.12.1}/ml_tools/logger.py +0 -0
- {dragon_ml_toolbox-3.11.0 → dragon_ml_toolbox-3.12.1}/setup.cfg +0 -0

ml_tools/GUI_tools.py (+147 -17)

@@ -3,7 +3,7 @@ from pathlib import Path
 import traceback
 import FreeSimpleGUI as sg
 from functools import wraps
-from typing import Any, Dict, Tuple, List, Literal, Union,
+from typing import Any, Dict, Tuple, List, Literal, Union, Optional, Callable
 from .utilities import _script_info
 import numpy as np
 from .logger import _LOGGER
@@ -104,11 +104,13 @@ class ConfigManager:
             'max_size': ''
         }
         config['Layout'] = {
-            '; Default size for continuous input boxes (width,height in characters).': '',
+            '; Default size for continuous input boxes (width,height in characters/rows).': '',
             'input_size_cont': '16,1',
-            '; Default size for combo/binary boxes (width,height in characters).': '',
+            '; Default size for combo/binary boxes (width,height in characters/rows).': '',
             'input_size_binary': '14,1',
-            ';
+            '; Size for multiselect listboxes (width,height in characters/rows).': '',
+            'input_size_multi': '14,4',
+            '; Default size for buttons (width,height in characters/rows).': '',
             'button_size': '15,2'
         }
         config['Fonts'] = {
@@ -303,6 +305,57 @@ class GUIFactory:
 
         # Default to 'grid' layout
         return [columns[i:i + features_per_column] for i in range(0, len(columns), features_per_column)]
+
+    def generate_multiselect_layout(
+            self,
+            data_dict: Dict[str, Union[List[Any], Tuple[Any, ...]]],
+            layout_mode: Literal["grid", "row"] = 'grid',
+            features_per_column: int = 4
+    ) -> List[List[sg.Column]]:
+        """
+        Generates a layout for features using Listbox elements for multiple selections.
+
+        This allows the user to select zero or more options from a list without
+        being able to input custom text.
+
+        Args:
+            data_dict (dict): Keys are feature names, values are lists of options.
+            layout_mode (str): 'grid' for a multi-row grid layout, or 'row' for a single horizontal row.
+            features_per_column (int): Number of features per column when `layout_mode` is 'grid'.
+
+        Returns:
+            A list of lists of sg.Column elements, ready to be used in a window layout.
+        """
+        cfg = self.config
+        bg_color = sg.theme_background_color()
+        label_font = (cfg.fonts.font_family, cfg.fonts.label_size, cfg.fonts.label_style) # type: ignore
+
+        columns = []
+        for name, values in data_dict.items():
+            label = sg.Text(name, font=label_font, background_color=bg_color, key=f"_text_{name}")
+
+            # Use sg.Listbox for multiple selections.
+            element = sg.Listbox(
+                values,
+                key=name,
+                select_mode=sg.LISTBOX_SELECT_MODE_MULTIPLE,
+                size=cfg.layout.input_size_multi, # type: ignore
+                no_scrollbar=False
+            )
+            # -------------------
+
+            layout = [[label], [element]]
+            # Add a small spacer for consistent vertical alignment.
+            layout.append([sg.Text(" ", font=(cfg.fonts.font_family, 2), background_color=bg_color)]) # type: ignore
+
+            # Each feature is wrapped in a Column element for proper alignment.
+            columns.append(sg.Column(layout, background_color=bg_color))
+
+        if layout_mode == 'row':
+            return [columns] # A single row containing all columns
+
+        # Default to 'grid' layout
+        return [columns[i:i + features_per_column] for i in range(0, len(columns), features_per_column)]
 
     # --- Window Creation ---
     def create_window(self, title: str, layout: List[List[sg.Element]], **kwargs) -> sg.Window:
@@ -384,6 +437,7 @@ class FeatureMaster:
                  targets: Dict[str, str],
                  continuous_features: Optional[Dict[str, Tuple[str, float, float]]] = None,
                  binary_features: Optional[Dict[str, str]] = None,
+                 multi_binary_features: Optional[Dict[str, Dict[str, str]]] = None,
                  one_hot_features: Optional[Dict[str, Dict[str, str]]] = None,
                  categorical_features: Optional[List[Tuple[str, str, Dict[str, int]]]] = None) -> None:
         """
@@ -410,6 +464,14 @@ class FeatureMaster:
             A dictionary for binary (True/False) features.
             - **key** (str): The name to be displayed in the GUI (e.g., for a checkbox).
             - **value** (str): The model's internal feature name.
+
+        multi_binary_features (Dict[str, Dict[str, str]]):
+            A dictionary for features where multiple binary-like options can be
+            selected at once (e.g., from a multi-select listbox).
+            - **key** (str): The name for the group to be displayed in the GUI.
+            - **value** (Dict[str, str]): A nested dictionary where:
+                - key (str): The user-selectable option.
+                - value (str): The corresponding model's internal feature name.
 
         one_hot_features (Dict[str, Dict[str, str]]):
             A dictionary for features that will be one-hot encoded from a single
@@ -418,8 +480,7 @@ class FeatureMaster:
             for a dropdown menu).
             - **value** (Dict[str, str]): A nested dictionary where:
                 - key (str): The user-selectable option (e.g., 'Category A').
-                - value (str): The corresponding model column name
-                  set to 1.
+                - value (str): The corresponding model column name.
 
         categorical_features (List[Tuple[str, str, Dict[str, int]]]):
             A list for ordinal or label-encoded categorical features.
@@ -431,7 +492,7 @@ class FeatureMaster:
             options to their corresponding integer values.
         """
         # Validation
-        if continuous_features is None and binary_features is None and one_hot_features is None and categorical_features is None:
+        if continuous_features is None and binary_features is None and one_hot_features is None and categorical_features is None and multi_binary_features is None:
             raise ValueError("No features provided.")
 
         # Targets
@@ -454,6 +515,15 @@ class FeatureMaster:
         else:
             self._binary_values, self._binary_mapping = None, None
             self.has_binary = False
+
+        # multi-binary features
+        if multi_binary_features is not None:
+            self._multi_binary_values = self._handle_multi_binary_features(multi_binary_features)
+            self._multi_binary_mapping = multi_binary_features
+            self.has_multi_binary = True
+        else:
+            self._multi_binary_values, self._multi_binary_mapping = None, None
+            self.has_multi_binary = False
 
         # one-hot features
         if one_hot_features is not None:
@@ -493,6 +563,14 @@ class FeatureMaster:
         gui_values: dict[str, tuple[Literal["False"],Literal["True"]]] = {gui_key: ("False", "True") for gui_key in binary_features.keys()}
         # Map GUI name to Model name (same as input)
         return gui_values
+
+    def _handle_multi_binary_features(self, multi_binary_features: Dict[str, Dict[str, str]]):
+        # Make dictionary GUI name: range values
+        gui_values: dict[str, tuple[str,...]] = {
+            gui_key: tuple(nested_dict.keys())
+            for gui_key, nested_dict in multi_binary_features.items()}
+        # Map GUI name to Model name and preserve internal mapping (same as input)
+        return gui_values
 
     def _handle_one_hot_features(self, one_hot_features: Dict[str, Dict[str,str]]):
         # Make dictionary GUI name: range values
@@ -514,6 +592,8 @@ class FeatureMaster:
             all_dict.update(self._continuous_mapping)
         if self._binary_mapping is not None:
             all_dict.update(self._binary_mapping)
+        if self._multi_binary_mapping is not None:
+            all_dict.update(self._multi_binary_mapping)
         if self._one_hot_mapping is not None:
             all_dict.update(self._one_hot_mapping)
         if self._categorical_mapping is not None:
@@ -595,6 +675,28 @@ class FeatureMaster:
         """
         if self._binary_values is not None:
             return self._binary_values
+
+    @property
+    def multi_binary(self):
+        """
+        The mapping for multi-binary features.
+
+        Structure:
+        {"GUI NAME": {"GUI OPTION 1": "model_column"}}
+        """
+        if self._multi_binary_mapping is not None:
+            return self._multi_binary_mapping
+
+    @property
+    def multi_binary_gui(self):
+        """
+        The GUI options for multi-binary feature groups.
+
+        Structure:
+        Dict[str, Tuple[str, ...]]
+        """
+        if self._multi_binary_values is not None:
+            return self._multi_binary_values
 
     @property
     def one_hot(self):
@@ -697,7 +799,7 @@ class GUIHandler:
         Maps GUI name to model expected name and casts the value to float.
         """
         try:
-            model_name = self.master.continuous[gui_feature]
+            model_name = self.master.continuous[gui_feature] # type: ignore
            float_value = float(chosen_value)
         except KeyError as e:
             _LOGGER.error(f"No matching name for '{gui_feature}' defined as continuous.")
@@ -713,8 +815,8 @@ class GUIHandler:
         Maps GUI name to model expected name and casts the value to binary (0,1).
         """
         try:
-            model_name = self.master.binary[gui_feature]
-            binary_mapping_keys = self.master.binary_gui[gui_feature]
+            model_name = self.master.binary[gui_feature] # type: ignore
+            binary_mapping_keys = self.master.binary_gui[gui_feature] # type: ignore
         except KeyError as e:
             _LOGGER.error(f"No matching name for '{gui_feature}' defined as binary.")
             raise e
@@ -725,13 +827,36 @@ class GUIHandler:
             }
             result = mapping_dict[chosen_value]
             return model_name, result
+
+    def _process_multi_binary(self, gui_feature: str, chosen_values: list[str]) -> dict[str, int]:
+        """
+        Maps GUI names to model expected names and casts values to multi-binary encoding.
+
+        For a given feature group, this sets all selected options to 1 and all
+        unselected options to 0.
+        """
+        try:
+            # Get the mapping for the group
+            multi_binary_mapping = self.master.multi_binary[gui_feature] # type: ignore
+        except KeyError as e:
+            _LOGGER.error(f"No matching name for '{gui_feature}' defined as multi-binary.")
+            raise e
+        else:
+            # Start with all possible features for this group set to 0 (unselected)
+            results = {model_key: 0 for model_key in multi_binary_mapping.values()}
+            # Set the features for the chosen options to 1
+            for chosen_option in chosen_values:
+                model_name = multi_binary_mapping[chosen_option]
+                results[model_name] = 1
+
+            return results
 
     def _process_one_hot(self, gui_feature: str, chosen_value: str) -> Dict[str,int]:
         """
         Maps GUI names to model expected names and casts values to one-hot encoding.
         """
         try:
-            one_hot_mapping = self.master.one_hot[gui_feature]
+            one_hot_mapping = self.master.one_hot[gui_feature] # type: ignore
         except KeyError as e:
             _LOGGER.error(f"No matching name for '{gui_feature}' defined as one-hot.")
             raise e
@@ -748,7 +873,7 @@ class GUIHandler:
         Maps GUI name to model expected name and casts the value to a categorical number.
         """
         try:
-            categorical_tuple = self.master.categorical[gui_feature]
+            categorical_tuple = self.master.categorical[gui_feature] # type: ignore
         except KeyError as e:
             _LOGGER.error(f"No matching name for '{gui_feature}' defined as categorical.")
             raise e
@@ -804,25 +929,31 @@ class GUIHandler:
 
         if self.master.has_continuous:
             processed_subset = self._call_subprocess(window_values=window_values,
-                                                     master_feature=self.master.continuous,
+                                                     master_feature=self.master.continuous, # type: ignore
                                                      processor=self._process_continuous)
             processed_features.update(processed_subset)
 
         if self.master.has_binary:
             processed_subset = self._call_subprocess(window_values=window_values,
-                                                     master_feature=self.master.binary,
+                                                     master_feature=self.master.binary, # type: ignore
                                                      processor=self._process_binary)
             processed_features.update(processed_subset)
+
+        if self.master.has_multi_binary:
+            processed_subset = self._call_subprocess(window_values=window_values,
+                                                     master_feature=self.master.multi_binary, # type: ignore
+                                                     processor=self._process_multi_binary)
+            processed_features.update(processed_subset)
 
         if self.master.has_one_hot:
             processed_subset = self._call_subprocess(window_values=window_values,
-                                                     master_feature=self.master.one_hot,
+                                                     master_feature=self.master.one_hot, # type: ignore
                                                      processor=self._process_one_hot)
             processed_features.update(processed_subset)
 
         if self.master.has_categorical:
             processed_subset = self._call_subprocess(window_values=window_values,
-                                                     master_feature=self.master.categorical,
+                                                     master_feature=self.master.categorical, # type: ignore
                                                      processor=self._process_categorical)
             processed_features.update(processed_subset)
 
@@ -836,7 +967,6 @@ class GUIHandler:
             raise RuntimeError(f"Configuration Error: Implemented methods failed to generate the required model feature: '{e}'")
 
         return np.array(final_vector, dtype=np.float32)
-
 
 def info():
     _script_info(__all__)
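
The core of the new multi-binary support is the encoding rule in _process_multi_binary: every model column in a group starts at 0, and each option the user selects in the Listbox flips its column to 1. A minimal self-contained sketch of that rule (the option and column names here are hypothetical, not from the package):

    # GUI option -> model column, as passed via multi_binary_features.
    multi_binary_mapping = {
        "Additive A": "additive_a",
        "Additive B": "additive_b",
        "Additive C": "additive_c",
    }
    chosen_values = ["Additive A", "Additive C"]  # what a Listbox selection returns

    # Start with every column unselected (0), then set the chosen ones to 1.
    results = {model_key: 0 for model_key in multi_binary_mapping.values()}
    for chosen_option in chosen_values:
        results[multi_binary_mapping[chosen_option]] = 1

    print(results)  # {'additive_a': 1, 'additive_b': 0, 'additive_c': 1}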

ml_tools/MICE_imputation.py (+6 -6)

@@ -35,7 +35,7 @@ def apply_mice(df: pd.DataFrame, df_name: str, binary_columns: Optional[list[str
     imputed_datasets = [kernel.complete_data(dataset=i) for i in range(resulting_datasets)]
 
     if imputed_datasets is None or len(imputed_datasets) == 0:
-        raise ValueError("No imputed datasets were generated. Check the MICE process.")
+        raise ValueError("❌ No imputed datasets were generated. Check the MICE process.")
 
     # threshold binary columns
     if binary_columns is not None:
@@ -56,8 +56,8 @@ def apply_mice(df: pd.DataFrame, df_name: str, binary_columns: Optional[list[str
 
     # Ensure indexes match
     for imputed_df, subname in zip(imputed_datasets, imputed_dataset_names):
-        assert imputed_df.shape[0] == df.shape[0], f"Row count mismatch in dataset {subname}" # type: ignore
-        assert all(imputed_df.index == df.index), f"Index mismatch in dataset {subname}" # type: ignore
+        assert imputed_df.shape[0] == df.shape[0], f"❌ Row count mismatch in dataset {subname}" # type: ignore
+        assert all(imputed_df.index == df.index), f"❌ Index mismatch in dataset {subname}" # type: ignore
     # print("✅ All imputed datasets match the original DataFrame indexes.")
 
     return kernel, imputed_datasets, imputed_dataset_names
@@ -90,7 +90,7 @@ def get_convergence_diagnostic(kernel: mf.ImputationKernel, imputed_dataset_name
     dataset_count = kernel.num_datasets
 
     if dataset_count != len(imputed_dataset_names):
-        raise ValueError(f"Expected {dataset_count} names in imputed_dataset_names, got {len(imputed_dataset_names)}")
+        raise ValueError(f"❌ Expected {dataset_count} names in imputed_dataset_names, got {len(imputed_dataset_names)}")
 
     # Check path
     root_path = make_fullpath(root_dir, make=True)
@@ -152,7 +152,7 @@ def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_di
     """Helper function to add labels and legends to a figure"""
 
     if not isinstance(fig, ggplot):
-        raise TypeError("Expected a plotnine.ggplot object")
+        raise TypeError("❌ Expected a plotnine.ggplot object")
 
     # Edit labels and title
     fig = fig + theme(
@@ -166,7 +166,7 @@ def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_di
     fig = fig.draw()
 
     if not hasattr(fig, 'axes') or len(fig.axes) == 0:
-        raise RuntimeError("Rendered figure has no axes to modify")
+        raise RuntimeError("❌ Rendered figure has no axes to modify")
 
     if filename == "Combined_Distributions":
         custom_xlabel = "Feature Values"

ml_tools/PSO_optimization.py (+2 -4)

@@ -530,10 +530,8 @@ def plot_optimal_feature_distributions(results_dir: Union[str, Path], save_dir:
     results_path = make_fullpath(results_dir)
     output_path = make_fullpath(save_dir, make=True)
 
-
-
-    _LOGGER.warning("⚠️ No data found. No plots will be generated.")
-    return
+    # Check that the directory contains csv files
+    list_csv_paths(results_path, verbose=False)
 
     # --- Data Loading and Preparation ---
     _LOGGER.info(f"📁 Starting analysis from results in: '{results_dir}'")
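
The removed code warned and returned unconditionally, which made the function a no-op; the fix instead delegates the check to list_csv_paths, which raises as soon as the directory holds no CSV files. A sketch of why that single call works as a guard, mirroring the list_csv_paths behavior shown later in this diff:

    from pathlib import Path

    # Equivalent stand-alone guard, assuming list_csv_paths raises IOError
    # for a directory that contains no CSV files.
    def assert_contains_csv(directory: Path) -> None:
        if not list(directory.glob("*.csv")):
            raise IOError(f"❌ No CSV files found in directory: {directory.name}")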

ml_tools/VIF_factor.py (+15 -13)

@@ -26,8 +26,7 @@ def compute_vif(
     save_dir: Optional[Union[str,Path]] = None,
     filename: Optional[str] = None,
     fontsize: int = 14,
-    show_plot: bool = True
-    verbose: bool = True
+    show_plot: bool = True
 ) -> pd.DataFrame:
     """
     Computes Variance Inflation Factors (VIF) for numeric columns in a DataFrame. Optionally, generates a bar plot of VIF values.
@@ -54,21 +53,20 @@ def compute_vif(
     if use_columns is None:
         sanitized_columns = df.select_dtypes(include='number').columns.tolist()
         missing_features = set(ground_truth_cols) - set(sanitized_columns)
-        if missing_features
+        if missing_features:
             _LOGGER.warning(f"⚠️ These columns are not Numeric:\n{missing_features}")
     else:
         sanitized_columns = list()
         for feature in use_columns:
             if feature not in ground_truth_cols:
-
-                _LOGGER.warning(f"⚠️ The provided column '{feature}' is not in the DataFrame.")
+                _LOGGER.warning(f"⚠️ The provided column '{feature}' is not in the DataFrame.")
             else:
                 sanitized_columns.append(feature)
 
     if ignore_columns is not None and use_columns is None:
         missing_ignore = set(ignore_columns) - set(ground_truth_cols)
-        if missing_ignore
-            _LOGGER.warning(f"⚠️ Warning: The following 'columns to ignore' are not in the Dataframe:\n{missing_ignore}")
+        if missing_ignore:
+            _LOGGER.warning(f"⚠️ Warning: The following 'columns to ignore' are not found in the Dataframe:\n{missing_ignore}")
         sanitized_columns = [f for f in sanitized_columns if f not in ignore_columns]
 
     X = df[sanitized_columns].copy()
@@ -139,7 +137,7 @@ def compute_vif(
             filename += ".svg"
         full_save_path = save_path / filename
         plt.savefig(full_save_path, format='svg', bbox_inches='tight')
-
+        _LOGGER.info(f"✅ Saved VIF plot: '{filename}'")
 
     if show_plot:
         plt.show()
@@ -164,11 +162,16 @@ def drop_vif_based(df: pd.DataFrame, vif_df: pd.DataFrame, threshold: float = 10
     """
     # Ensure expected structure
     if 'feature' not in vif_df.columns or 'VIF' not in vif_df.columns:
-        raise ValueError("
+        raise ValueError("'vif_df' must contain 'feature' and 'VIF' columns.")
 
     # Identify features to drop
     to_drop = vif_df[vif_df["VIF"] > threshold]["feature"].tolist()
-
+    if len(to_drop) > 0:
+        _LOGGER.info(f"🗑️ Dropping {len(to_drop)} column(s) with VIF > {threshold}:")
+        for dropped_column in to_drop:
+            print(f"\t{dropped_column}")
+    else:
+        _LOGGER.info(f"No columns exceed the VIF threshold of '{threshold}'.")
 
     result_df = df.drop(columns=to_drop)
 
@@ -186,7 +189,7 @@ def compute_vif_multi(input_directory: Union[str, Path],
     max_features_to_plot: int = 20,
     fontsize: int = 14):
     """
-    Computes Variance Inflation Factors (VIF) for numeric columns in a directory with CSV files (loaded as pandas DataFrames). No plots
+    Computes Variance Inflation Factors (VIF) for numeric columns in a directory with CSV files (loaded as pandas DataFrames). No plots will be displayed inline.
     Generates a bar plot of VIF values. Optionally drops columns with VIF >= 10 and saves as a new CSV file.
 
     Args:
@@ -216,8 +219,7 @@ def compute_vif_multi(input_directory: Union[str, Path],
                 fontsize=fontsize,
                 save_dir=output_plot_directory,
                 filename=df_name,
-                show_plot=False
-                verbose=False)
+                show_plot=False)
 
     if output_dataset_path is not None:
         new_filename = df_name + '_VIF'
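
The new logging in drop_vif_based reports exactly the columns selected by the existing rule: VIF strictly greater than the threshold. A worked example with made-up data:

    import pandas as pd

    # Hypothetical vif_df with the 'feature'/'VIF' columns drop_vif_based expects.
    vif_df = pd.DataFrame({"feature": ["x1", "x2", "x3"], "VIF": [3.2, 25.7, 10.4]})
    df = pd.DataFrame({"x1": [1, 2], "x2": [3, 4], "x3": [5, 6]})

    threshold = 10.0
    to_drop = vif_df[vif_df["VIF"] > threshold]["feature"].tolist()  # ['x2', 'x3']
    result_df = df.drop(columns=to_drop)  # only 'x1' remains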

ml_tools/data_exploration.py (+1 -1)

@@ -143,7 +143,7 @@ def drop_rows_with_missing_data(df: pd.DataFrame, targets: Optional[list[str]],
     feature_na_frac = df_clean[feature_cols].isnull().mean(axis=1)
     rows_to_drop = feature_na_frac[feature_na_frac > threshold].index
     if len(rows_to_drop) > 0:
-        print(f"
+        print(f"🧹 Dropping {len(rows_to_drop)} rows with more than {threshold*100:.0f}% missing feature data.")
         df_clean = df_clean.drop(index=rows_to_drop)
     else:
         print(f"✅ No rows exceed the {threshold*100:.0f}% missing feature data threshold.")
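
The completed print statement documents the rule applied here: a row is dropped when its fraction of missing feature values strictly exceeds the threshold. A small worked example:

    import pandas as pd

    df_clean = pd.DataFrame({"f1": [1.0, None, None], "f2": [4.0, None, 6.0]})
    feature_cols = ["f1", "f2"]
    threshold = 0.5

    feature_na_frac = df_clean[feature_cols].isnull().mean(axis=1)  # 0.0, 1.0, 0.5
    rows_to_drop = feature_na_frac[feature_na_frac > threshold].index  # only row 1
    df_clean = df_clean.drop(index=rows_to_drop)  # row 2 (exactly 50% missing) is kept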

ml_tools/handle_excel.py (+9 -8)

@@ -36,7 +36,7 @@ def find_excel_files(
     input_path = make_fullpath(directory)
 
     if not input_path.is_dir():
-        raise NotADirectoryError(f"Directory not found: {input_path}")
+        raise NotADirectoryError(f"❌ Directory not found: {input_path}")
 
     excel_files = [
         f for f in input_path.iterdir()
@@ -46,7 +46,7 @@ def find_excel_files(
     ]
 
     if not excel_files:
-        raise FileNotFoundError(f"No valid Excel files found in directory: {input_path}")
+        raise FileNotFoundError(f"❌ No valid Excel files found in directory: {input_path}")
 
     return excel_files
 
@@ -198,7 +198,7 @@ def validate_excel_schema(
                 invalid_files.append(file)
 
         except Exception as e:
-            _LOGGER.error(f"Error processing '{file}': {e}")
+            _LOGGER.error(f"❌ Error processing '{file}': {e}")
             invalid_files.append(file)
 
     valid_excel_number = len(excel_paths) - len(invalid_files)
@@ -251,7 +251,7 @@ def vertical_merge_transform_excel(
         if target_columns is not None:
             missing = [col for col in target_columns if col not in df.columns]
             if missing:
-                raise ValueError(f"Invalid columns in {file.name}: {missing}")
+                raise ValueError(f"❌ Invalid columns in {file.name}: {missing}")
             df = df[target_columns]
 
         dataframes.append(df)
@@ -261,7 +261,7 @@ def vertical_merge_transform_excel(
     if rename_columns is not None:
         expected_len = len(target_columns if target_columns is not None else merged_df.columns)
         if len(rename_columns) != expected_len:
-            raise ValueError("Length of 'rename_columns' must match the selected columns")
+            raise ValueError("❌ Length of 'rename_columns' must match the selected columns")
         merged_df.columns = rename_columns
 
     merged_df.to_csv(csv_path, index=False, encoding='utf-8')
@@ -324,6 +324,9 @@ def horizontal_merge_transform_excel(
     merged_df = pd.concat(padded_dataframes, axis=1)
 
     duplicate_columns = merged_df.columns[merged_df.columns.duplicated()].tolist()
+
+    if duplicate_columns:
+        _LOGGER.warning(f"⚠️ Duplicate columns: {duplicate_columns}")
 
     if skip_duplicates:
         merged_df = merged_df.loc[:, ~merged_df.columns.duplicated()]
@@ -344,9 +347,7 @@ def horizontal_merge_transform_excel(
     merged_df.to_csv(csv_path, index=False, encoding='utf-8')
 
     _LOGGER.info(f"✅ Merged {len(excel_files)} Excel files into '{csv_filename}'.")
-
-    _LOGGER.warning(f"⚠️ Duplicate columns: {duplicate_columns}")
-
+
 
 def info():
     _script_info(__all__)
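
The duplicate-column warning now fires only when duplicates actually exist, and it fires right after the merge instead of unconditionally at the end. A short sketch of the detection it relies on:

    import pandas as pd

    merged_df = pd.concat(
        [pd.DataFrame({"a": [1], "b": [2]}), pd.DataFrame({"b": [3], "c": [4]})],
        axis=1,
    )
    duplicate_columns = merged_df.columns[merged_df.columns.duplicated()].tolist()  # ['b']
    if duplicate_columns:
        print(f"⚠️ Duplicate columns: {duplicate_columns}")
    # With skip_duplicates, only the first occurrence of each column is kept:
    deduped = merged_df.loc[:, ~merged_df.columns.duplicated()]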

ml_tools/path_manager.py (+1 -1)

@@ -102,7 +102,7 @@ class PathManager:
         for key in new_paths:
             if key in self._paths:
                 raise KeyError(
-                    f"Path key '{key}' already exists in the manager. To replace it, call update() with overwrite=True."
+                    f"❌ Path key '{key}' already exists in the manager. To replace it, call update() with overwrite=True."
                 )
 
         # Resolve any string paths to Path objects before storing

ml_tools/utilities.py (+73 -41)

@@ -32,28 +32,42 @@ __all__ = [
 def make_fullpath(
     input_path: Union[str, Path],
     make: bool = False,
-    verbose: bool = False
+    verbose: bool = False,
+    enforce: Optional[Literal["directory", "file"]] = None
 ) -> Path:
     """
-    Resolves a string or Path into an absolute Path.
+    Resolves a string or Path into an absolute Path, optionally creating it.
 
     - If the path exists, it is returned.
     - If the path does not exist and `make=True`, it will:
-        - Create the file if the path has a suffix
+        - Create the file if the path has a suffix
         - Create the directory if it has no suffix
     - If `make=False` and the path does not exist, an error is raised.
+    - If `enforce`, raises an error if the resolved path is not what was enforced.
     - Optionally prints whether the resolved path is a file or directory.
 
     Parameters:
-        input_path (str | Path):
-
-
+        input_path (str | Path):
+            Path to resolve.
+        make (bool):
+            If True, attempt to create file or directory.
+        verbose (bool):
+            Print classification after resolution.
+        enforce ("directory" | "file" | None):
+            Raises an error if the resolved path is not what was enforced.
 
     Returns:
         Path: Resolved absolute path.
 
     Raises:
         ValueError: If the path doesn't exist and can't be created.
+        TypeError: If the final path does not match the `enforce` parameter.
+
+    ## 🗒️ Note:
+
+    Directories with dots will be treated as files.
+
+    Files without extension will be treated as directories.
     """
     path = Path(input_path).expanduser()
 
@@ -75,6 +89,12 @@ def make_fullpath(
             resolved = path.resolve(strict=True)
         except Exception as e:
             raise ValueError(f"❌ Failed to create {'file' if is_file else 'directory'} '{path}': {e}")
+
+    if enforce == "file" and not resolved.is_file():
+        raise TypeError(f"❌ Path was enforced as a file, but it is not: '{resolved}'")
+
+    if enforce == "directory" and not resolved.is_dir():
+        raise TypeError(f"❌ Path was enforced as a directory, but it is not: '{resolved}'")
 
     if verbose:
         if resolved.is_file():
@@ -87,7 +107,7 @@ def make_fullpath(
     return resolved
 
 
-def list_csv_paths(directory: Union[str,Path]) -> dict[str, Path]:
+def list_csv_paths(directory: Union[str,Path], verbose: bool=True) -> dict[str, Path]:
     """
     Lists all `.csv` files in the specified directory and returns a mapping: filenames (without extensions) to their absolute paths.
 
@@ -101,19 +121,20 @@ def list_csv_paths(directory: Union[str,Path]) -> dict[str, Path]:
 
     csv_paths = list(dir_path.glob("*.csv"))
     if not csv_paths:
-        raise IOError(f"No CSV files found in directory: {dir_path.name}")
+        raise IOError(f"❌ No CSV files found in directory: {dir_path.name}")
 
     # make a dictionary of paths and names
     name_path_dict = {p.stem: p for p in csv_paths}
 
-
-
-
+    if verbose:
+        print("\n🗂️ CSV files found:")
+        for name in name_path_dict.keys():
+            print(f"\t{name}")
 
     return name_path_dict
 
 
-def list_files_by_extension(directory: Union[str,Path], extension: str) -> dict[str, Path]:
+def list_files_by_extension(directory: Union[str,Path], extension: str, verbose: bool=True) -> dict[str, Path]:
     """
     Lists all files with the specified extension in the given directory and returns a mapping:
     filenames (without extensions) to their absolute paths.
@@ -133,13 +154,14 @@ def list_files_by_extension(directory: Union[str,Path], extension: str) -> dict[
 
     matched_paths = list(dir_path.glob(pattern))
     if not matched_paths:
-        raise IOError(f"No '.{normalized_ext}' files found in directory: {dir_path}")
+        raise IOError(f"❌ No '.{normalized_ext}' files found in directory: {dir_path}")
 
     name_path_dict = {p.stem: p for p in matched_paths}
 
-
-
-
+    if verbose:
+        print(f"\n📂 '{normalized_ext.upper()}' files found:")
+        for name in name_path_dict:
+            print(f"\t{name}")
 
     return name_path_dict
 
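
Before the remaining utilities.py hunks, a quick usage sketch of the new enforce parameter added above: resolution works as before, but a TypeError is raised when the resolved path is not of the enforced kind. The import path is assumed from the package layout, and the paths are illustrative:

    from ml_tools.utilities import make_fullpath  # import path assumed

    data_dir = make_fullpath("~/datasets", make=True, enforce="directory")
    table_file = make_fullpath("~/datasets/table.csv", enforce="file")  # must already exist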

@@ -147,7 +169,8 @@ def list_files_by_extension(directory: Union[str,Path], extension: str) -> dict[
 def load_dataframe(
     df_path: Union[str, Path],
     kind: Literal["pandas", "polars"] = "pandas",
-    all_strings: bool = False
+    all_strings: bool = False,
+    verbose: bool = True
 ) -> Tuple[Union[pd.DataFrame, pl.DataFrame], str]:
     """
     Load a CSV file into a DataFrame and extract its base name.
@@ -191,20 +214,21 @@
         df = pl.read_csv(path, infer_schema_length=1000)
 
     else:
-        raise ValueError(f"Invalid kind '{kind}'. Must be one of 'pandas' or 'polars'.")
+        raise ValueError(f"❌ Invalid kind '{kind}'. Must be one of 'pandas' or 'polars'.")
 
     # This check works for both pandas and polars DataFrames
     if df.shape[0] == 0:
-        raise ValueError(f"DataFrame '{df_name}' loaded from '{path}' is empty.")
+        raise ValueError(f"❌ DataFrame '{df_name}' loaded from '{path}' is empty.")
 
-
+    if verbose:
+        print(f"\n💾 Loaded {kind.upper()} dataset: '{df_name}' with shape: {df.shape}")
 
     return df, df_name
 
 
-def yield_dataframes_from_dir(datasets_dir: Union[str,Path]):
+def yield_dataframes_from_dir(datasets_dir: Union[str,Path], verbose: bool=True):
     """
-    Iterates over all CSV files in a given directory, loading each into a
+    Iterates over all CSV files in a given directory, loading each into a Pandas DataFrame.
 
     Parameters:
         datasets_dir (str | Path):
@@ -221,9 +245,10 @@ def yield_dataframes_from_dir(datasets_dir: Union[str,Path]):
     - Output is streamed via a generator to support lazy loading of multiple datasets.
     """
     datasets_path = make_fullpath(datasets_dir)
-
+    files_dict = list_csv_paths(datasets_path, verbose=verbose)
+    for df_name, df_path in files_dict.items():
         df: pd.DataFrame
-        df, _ = load_dataframe(df_path, kind="pandas") # type: ignore
+        df, _ = load_dataframe(df_path, kind="pandas", verbose=verbose) # type: ignore
         yield df, df_name
 
 
@@ -253,35 +278,35 @@ def merge_dataframes(
         - If column names or order differ for vertical merge.
     """
     if len(dfs) < 2:
-        raise ValueError("At least 2 DataFrames must be provided.")
+        raise ValueError("❌ At least 2 DataFrames must be provided.")
 
     if verbose:
         for i, df in enumerate(dfs, start=1):
-            print(f"DataFrame {i} shape: {df.shape}")
+            print(f"➡️ DataFrame {i} shape: {df.shape}")
 
 
     if direction == "horizontal":
         reference_index = dfs[0].index
         for i, df in enumerate(dfs, start=1):
             if not df.index.equals(reference_index):
-                raise ValueError(f"Indexes do not match: Dataset 1 and Dataset {i}.")
+                raise ValueError(f"❌ Indexes do not match: Dataset 1 and Dataset {i}.")
         merged_df = pd.concat(dfs, axis=1)
 
     elif direction == "vertical":
         reference_columns = dfs[0].columns
         for i, df in enumerate(dfs, start=1):
             if not df.columns.equals(reference_columns):
-                raise ValueError(f"Column names/order do not match: Dataset 1 and Dataset {i}.")
+                raise ValueError(f"❌ Column names/order do not match: Dataset 1 and Dataset {i}.")
         merged_df = pd.concat(dfs, axis=0)
 
     else:
-        raise ValueError(f"Invalid merge direction: {direction}")
+        raise ValueError(f"❌ Invalid merge direction: {direction}")
 
     if reset_index:
         merged_df = merged_df.reset_index(drop=True)
 
     if verbose:
-        print(f"Merged DataFrame shape: {merged_df.shape}")
+        print(f"\n✅ Merged DataFrame shape: {merged_df.shape}")
 
     return merged_df
 
@@ -320,9 +345,9 @@ def save_dataframe(df: Union[pd.DataFrame, pl.DataFrame], save_dir: Union[str,Pa
         df.write_csv(output_path) # Polars defaults to utf8 and no index
     else:
         # This error handles cases where an unsupported type is passed
-        raise TypeError(f"Unsupported DataFrame type: {type(df)}. Must be pandas or polars.")
+        raise TypeError(f"❌ Unsupported DataFrame type: {type(df)}. Must be pandas or polars.")
 
-    print(f"✅ Saved dataset: '{filename}' with shape: {df.shape}")
+    print(f"\n✅ Saved dataset: '{filename}' with shape: {df.shape}")
 
 
 def normalize_mixed_list(data: list, threshold: int = 2) -> list[float]:
@@ -356,7 +381,7 @@ def normalize_mixed_list(data: list, threshold: int = 2) -> list[float]:
 
     # Raise for negative values
     if any(x < 0 for x in float_list):
-        raise ValueError("Negative values are not allowed in the input list.")
+        raise ValueError("❌ Negative values are not allowed in the input list.")
 
     # Step 2: Compute log10 of non-zero values
     nonzero = [x for x in float_list if x > 0]
@@ -395,7 +420,7 @@ def sanitize_filename(filename: str) -> str:
     - Removing or replacing characters invalid in filenames.
 
     Args:
-
+        filename (str): Base filename.
 
     Returns:
         str: A sanitized string suitable to use as a filename.
@@ -408,6 +433,10 @@ def sanitize_filename(filename: str) -> str:
 
     # Conservative filter to keep filenames safe across platforms
     sanitized = re.sub(r'[^\w\-.]', '', sanitized)
+
+    # Check for empty string after sanitization
+    if not sanitized:
+        raise ValueError("The sanitized filename is empty. The original input may have contained only invalid characters.")
 
     return sanitized
 
@@ -418,6 +447,8 @@ def threshold_binary_values(
 ) -> Union[np.ndarray, pd.Series, pl.Series, list[float], tuple[float]]:
     """
     Thresholds binary features in a 1D input. The number of binary features are counted starting from the end.
+
+    Binary elements are converted to 0 or 1 using a 0.5 threshold.
 
     Parameters:
         input_array: 1D sequence, NumPy array, pandas Series, or polars Series.
@@ -426,7 +457,8 @@ def threshold_binary_values(
         - If `int`, only this many last `binary_values` are thresholded.
 
     Returns:
-
+        Any:
+            Same type as input
     """
     original_type = type(input_array)
 
@@ -437,14 +469,14 @@ def threshold_binary_values(
     elif isinstance(input_array, (list, tuple)):
         array = np.array(input_array)
     else:
-        raise TypeError("Unsupported input type")
+        raise TypeError("❌ Unsupported input type")
 
     array = array.flatten()
     total = array.shape[0]
 
     bin_count = total if binary_values is None else binary_values
     if not (0 <= bin_count <= total):
-        raise ValueError("binary_values must be between 0 and the total number of elements")
+        raise ValueError("❌ binary_values must be between 0 and the total number of elements")
 
     if bin_count == 0:
         result = array
@@ -484,9 +516,9 @@ def threshold_binary_values_batch(
     np.ndarray
         Thresholded array, same shape as input.
     """
-    assert input_array.ndim == 2, f"Expected 2D array, got {input_array.ndim}D"
+    assert input_array.ndim == 2, f"❌ Expected 2D array, got {input_array.ndim}D"
     batch_size, total_features = input_array.shape
-    assert 0 <= binary_values <= total_features, "binary_values out of valid range"
+    assert 0 <= binary_values <= total_features, "❌ binary_values out of valid range"
 
     if binary_values == 0:
         return input_array.copy()
@@ -523,7 +555,7 @@ def serialize_object(obj: Any, save_dir: Union[str,Path], filename: str, verbose
         return None
     else:
        if verbose:
-            print(f"✅ Object of type '{type(obj)}' saved to '{full_path}'")
+            print(f"\n✅ Object of type '{type(obj)}' saved to '{full_path}'")
        return None
 
 
@@ -550,7 +582,7 @@ def deserialize_object(filepath: Union[str,Path], verbose: bool=True, raise_on_e
         return None
     else:
         if verbose:
-            print(f"✅ Loaded object of type '{type(obj)}'")
+            print(f"\n✅ Loaded object of type '{type(obj)}'")
         return obj
 
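
The clarified threshold_binary_values docstring pins down the semantics: only the last binary_values elements are touched, and each is snapped to 0 or 1 at a 0.5 cutoff. A sketch of that rule (exact tie handling at 0.5 is an assumption):

    import numpy as np

    vector = np.array([3.7, 0.92, 0.12, 0.55])
    binary_values = 3  # binary features are counted from the end

    thresholded = vector.copy()
    thresholded[-binary_values:] = (thresholded[-binary_values:] > 0.5).astype(float)
    print(thresholded)  # [3.7  1.   0.   1.  ]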