dragon-ml-toolbox 2.2.1__py3-none-any.whl → 2.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dragon_ml_toolbox-2.2.1.dist-info → dragon_ml_toolbox-2.3.0.dist-info}/METADATA +1 -1
- {dragon_ml_toolbox-2.2.1.dist-info → dragon_ml_toolbox-2.3.0.dist-info}/RECORD +9 -9
- ml_tools/ETL_engineering.py +223 -2
- ml_tools/PSO_optimization.py +173 -67
- ml_tools/utilities.py +0 -1
- {dragon_ml_toolbox-2.2.1.dist-info → dragon_ml_toolbox-2.3.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-2.2.1.dist-info → dragon_ml_toolbox-2.3.0.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-2.2.1.dist-info → dragon_ml_toolbox-2.3.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-2.2.1.dist-info → dragon_ml_toolbox-2.3.0.dist-info}/top_level.txt +0 -0
@@ -1,8 +1,8 @@
-dragon_ml_toolbox-2.
-dragon_ml_toolbox-2.
-ml_tools/ETL_engineering.py,sha256=
+dragon_ml_toolbox-2.3.0.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
+dragon_ml_toolbox-2.3.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=6cfpIeQ6D4Mcs10nkogQrkVyq1T7i2qXjjNHFoUMOyE,1892
+ml_tools/ETL_engineering.py,sha256=ns8HsLWZhByurvjtUUW10p7If1h1O5-btUfCRXxzkME,31568
 ml_tools/MICE_imputation.py,sha256=1fovHycZMdZ6OgVh_bk8-r3wGi4rqf6rS10LOEWYaQo,11177
-ml_tools/PSO_optimization.py,sha256=
+ml_tools/PSO_optimization.py,sha256=gi56mF-q6BApYwhAd9jix0xiYz595WTPcUh7afZsRJ4,25378
 ml_tools/VIF_factor.py,sha256=lpM3Z2X_iZfXUWbCbURoeI0Tb196lU0bAsRo7q6AzBM,10235
 ml_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ml_tools/_particle_swarm_optimization.py,sha256=b_eNNkA89Y40hj76KauivT8KLScH1B9wF2IXptOqkOw,22220
@@ -13,9 +13,9 @@ ml_tools/handle_excel.py,sha256=Uasx-DX7RNVQSzGHVJhX7UQ9RgBbX5H1ud1Hw_y8Kp4,1294
 ml_tools/logger.py,sha256=_k7WJdpFJj3IsjOgvjLJgUFZyF8RK3Jlgp5tAu_dLQU,4767
 ml_tools/pytorch_models.py,sha256=bpWZsrSwCvHJQkR6UfoPpElsMv9AvmiNErNHC8NYB_I,10132
 ml_tools/trainer.py,sha256=WAZ4EdrZuTOAnGXRWV3XcLNce4s7EKGf2-qchLC08Ik,15702
-ml_tools/utilities.py,sha256=
+ml_tools/utilities.py,sha256=T6AnNEQjUDnMAMSIJ8yZqToAVESIlEKK0bGBEm3sAUU,20670
 ml_tools/vision_helpers.py,sha256=idQ-Ugp1IdsvwXiYyhYa9G3rTRTm37YRpkQDLEpANHM,7701
-dragon_ml_toolbox-2.
-dragon_ml_toolbox-2.
-dragon_ml_toolbox-2.
-dragon_ml_toolbox-2.
+dragon_ml_toolbox-2.3.0.dist-info/METADATA,sha256=4wivV_JKPd83xNzf6xzSfCwxiZgvYL5uW4yE6Da8tnU,2974
+dragon_ml_toolbox-2.3.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+dragon_ml_toolbox-2.3.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
+dragon_ml_toolbox-2.3.0.dist-info/RECORD,,
ml_tools/ETL_engineering.py
CHANGED
@@ -2,19 +2,120 @@ import polars as pl
 import re
 from typing import Literal, Union, Optional, Any, Callable, List, Dict
 from .utilities import _script_info
+import pandas as pd


 __all__ = [
+    "ColumnCleaner",
+    "DataFrameCleaner",
     "TransformationRecipe",
     "DataProcessor",
     "KeywordDummifier",
     "NumberExtractor",
     "MultiNumberExtractor",
+    "RatioCalculator",
     "CategoryMapper",
+    "RegexMapper",
     "ValueBinner",
     "DateFeatureExtractor"
 ]

+########## EXTRACT and CLEAN ##########
+
+class ColumnCleaner:
+    """
+    Cleans and standardizes a single pandas Series based on a dictionary of regex-to-value replacement rules.
+
+    Args:
+        rules (Dict[str, str]):
+            A dictionary where each key is a regular expression pattern and
+            each value is the standardized string to replace matches with.
+    """
+    def __init__(self, rules: Dict[str, str]):
+        if not isinstance(rules, dict):
+            raise TypeError("The 'rules' argument must be a dictionary.")
+
+        # Validate that all keys are valid regular expressions
+        for pattern in rules.keys():
+            try:
+                re.compile(pattern)
+            except re.error as e:
+                raise ValueError(f"Invalid regex pattern '{pattern}': {e}") from e
+
+        self.rules = rules
+
+    def clean(self, series: pd.Series) -> pd.Series:
+        """
+        Applies the standardization rules to the provided Series (requires string data).
+
+        Non-matching values are kept as they are.
+
+        Args:
+            series (pd.Series): The pandas Series to clean.
+
+        Returns:
+            pd.Series: A new Series with the values cleaned and standardized.
+        """
+        return series.astype(str).replace(self.rules, regex=True)
+
+
+class DataFrameCleaner:
+    """
+    Orchestrates the cleaning of multiple columns in a pandas DataFrame using a nested dictionary of rules and `ColumnCleaner` objects.
+
+    Args:
+        rules (Dict[str, Dict[str, str]]):
+            A nested dictionary where each top-level key is a column name,
+            and its value is a dictionary of regex rules for that column, as expected by `ColumnCleaner`.
+    """
+    def __init__(self, rules: Dict[str, Dict[str, str]]):
+        if not isinstance(rules, dict):
+            raise TypeError("The 'rules' argument must be a nested dictionary.")
+
+        for col_name, col_rules in rules.items():
+            if not isinstance(col_rules, dict):
+                raise TypeError(
+                    f"The value for column '{col_name}' must be a dictionary "
+                    f"of rules, but got type {type(col_rules).__name__}."
+                )
+
+        self.rules = rules
+
+    def clean(self, df: pd.DataFrame) -> pd.DataFrame:
+        """
+        Applies all defined cleaning rules to the DataFrame.
+
+        Args:
+            df (pd.DataFrame): The pandas DataFrame to clean.
+
+        Returns:
+            pd.DataFrame: A new, cleaned DataFrame.
+        """
+        rule_columns = set(self.rules.keys())
+        df_columns = set(df.columns)
+
+        missing_columns = rule_columns - df_columns
+
+        if missing_columns:
+            # Report all missing columns in a single, clear error message
+            raise ValueError(
+                f"The following columns specified in the cleaning rules "
+                f"were not found in the DataFrame: {sorted(list(missing_columns))}"
+            )
+
+        # Start the process
+        df_cleaned = df.copy()
+
+        for column_name, column_rules in self.rules.items():
+            # Create and apply the specific cleaner for the column
+            cleaner = ColumnCleaner(rules=column_rules)
+            df_cleaned[column_name] = cleaner.clean(df_cleaned[column_name])
+
+        return df_cleaned
+
+
+############ TRANSFORM ####################
+
 # Magic word for rename-only transformation
 _RENAME = "rename"

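For orientation, a minimal usage sketch of the two cleaners added above; the column name, rules, and data are illustrative, not taken from the package:

import pandas as pd
from ml_tools.ETL_engineering import DataFrameCleaner

# Hypothetical messy column with inconsistent spellings
df = pd.DataFrame({"solvent": ["EtOH", "ethanol ", "Ethanol", "MeOH"]})

# One regex-rules dict per column; a ColumnCleaner is built internally for each
cleaner = DataFrameCleaner(rules={
    "solvent": {
        r"(?i)^\s*et(oh|hanol)\s*$": "ethanol",
        r"(?i)^\s*meoh\s*$": "methanol",
    },
})
print(cleaner.clean(df)["solvent"].tolist())
# ['ethanol', 'ethanol', 'ethanol', 'methanol']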
@@ -336,8 +437,7 @@ class MultiNumberExtractor:
     """
     Extracts multiple numbers from a single polars string column into several new columns.

-    This transformer is designed for one-to-many mappings, such as parsing
-    ratios (100:30) or coordinates (10, 25) into separate columns.
+    This transformer is designed for one-to-many mappings, such as parsing coordinates (10, 25) into separate columns.

     Args:
         num_outputs (int):
@@ -413,6 +513,59 @@ class MultiNumberExtractor:
         return pl.select(output_expressions)


+class RatioCalculator:
+    """
+    A transformer that parses a string ratio (e.g., "40:5" or "30/2") and computes the result of the division.
+
+    Args:
+        regex_pattern (str, optional):
+            The regex pattern to find the numerator and denominator. It MUST
+            contain exactly two capturing groups: the first for the
+            numerator and the second for the denominator. Defaults to a
+            pattern that handles common delimiters like ':' and '/'.
+    """
+    def __init__(
+        self,
+        regex_pattern: str = r"(\d+\.?\d*)\s*[:/]\s*(\d+\.?\d*)"
+    ):
+        # --- Validation ---
+        try:
+            if re.compile(regex_pattern).groups != 2:
+                raise ValueError(
+                    "regex_pattern must contain exactly two "
+                    "capturing groups '(...)'."
+                )
+        except re.error as e:
+            raise ValueError(f"Invalid regex pattern provided: {e}") from e
+
+        self.regex_pattern = regex_pattern
+
+    def __call__(self, column: pl.Series) -> pl.Series:
+        """
+        Applies the ratio calculation logic to the input column.
+
+        Args:
+            column (pl.Series): The input Polars Series of ratio strings.
+
+        Returns:
+            pl.Series: A new Series of floats containing the division result.
+                Returns null for invalid formats or division by zero.
+        """
+        # .extract_groups returns a struct with a field for each capture group
+        # e.g., {"group_1": "40", "group_2": "5"}
+        groups = column.str.extract_groups(self.regex_pattern)
+
+        # Extract numerator and denominator, casting to float
+        # strict=False ensures that non-matches become null
+        numerator = groups.struct.field("group_1").cast(pl.Float64, strict=False)
+        denominator = groups.struct.field("group_2").cast(pl.Float64, strict=False)
+
+        # Safely perform division, returning null if denominator is 0
+        return pl.when(denominator != 0).then(
+            numerator / denominator
+        ).otherwise(None)
+
+
 class CategoryMapper:
     """
     A transformer that maps string categories to specified numerical values using a dictionary.
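A quick sketch of the new RatioCalculator under its default pattern (values illustrative). Since the transformer builds its result from polars expressions over the extracted groups, the output is materialized here with pl.select:

import polars as pl
from ml_tools.ETL_engineering import RatioCalculator

calc = RatioCalculator()  # default pattern accepts ':' and '/' delimiters
s = pl.Series("ratio", ["40:5", "30/2", "bad", "1:0"])
print(pl.select(calc(s)).to_series().to_list())
# [8.0, 15.0, None, None]  (non-matches and zero denominators become null)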
@@ -468,6 +621,74 @@ class CategoryMapper:
         return pl.select(final_expr).to_series()


+class RegexMapper:
+    """
+    A transformer that maps string categories to numerical values based on a
+    dictionary of regular expression patterns.
+
+    The class iterates through the mapping dictionary in order, and the first
+    pattern that matches a given string determines the output value. This
+    "first match wins" logic makes the order of the mapping important.
+
+    Args:
+        mapping (Dict[str, Union[int, float]]):
+            An ordered dictionary where keys are regex patterns and values are
+            the numbers to map to if the pattern is found.
+        unseen_value (Optional[Union[int, float]], optional):
+            The numerical value to use for strings that do not match any
+            of the regex patterns. If None (default), unseen values are
+            mapped to null.
+    """
+    def __init__(
+        self,
+        mapping: Dict[str, Union[int, float]],
+        unseen_value: Optional[Union[int, float]] = None,
+    ):
+        # --- Validation ---
+        if not isinstance(mapping, dict):
+            raise TypeError("The 'mapping' argument must be a dictionary.")
+
+        for pattern, value in mapping.items():
+            try:
+                re.compile(pattern)
+            except re.error as e:
+                raise ValueError(f"Invalid regex pattern '{pattern}': {e}") from e
+            if not isinstance(value, (int, float)):
+                raise TypeError(f"Mapping values must be int or float, but got {type(value)} for pattern '{pattern}'.")
+
+        self.mapping = mapping
+        self.unseen_value = unseen_value
+
+    def __call__(self, column: pl.Series) -> pl.Series:
+        """
+        Applies the regex mapping logic to the input column.
+
+        Args:
+            column (pl.Series): The input Polars Series of string data.
+
+        Returns:
+            pl.Series: A new Series with strings mapped to numbers based on
+                the first matching regex pattern.
+        """
+        # Ensure the column is treated as a string for matching
+        str_column = column.cast(pl.Utf8)
+
+        # Build the when/then/otherwise chain from the inside out.
+        # Start with the final fallback value for non-matches.
+        mapping_expr = pl.lit(self.unseen_value)
+
+        # Iterate through the mapping in reverse to construct the nested expression
+        for pattern, value in reversed(list(self.mapping.items())):
+            mapping_expr = (
+                pl.when(str_column.str.contains(pattern))
+                .then(pl.lit(value))
+                .otherwise(mapping_expr)
+            )
+
+        # Execute the complete expression chain and return the resulting Series
+        return pl.select(mapping_expr).to_series()
+
+
 class ValueBinner:
     """
     A transformer that discretizes a continuous numerical column into a finite number of bins.
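And a sketch of the new RegexMapper, whose "first match wins" ordering is the main contract; patterns and values here are illustrative:

import polars as pl
from ml_tools.ETL_engineering import RegexMapper

mapper = RegexMapper(
    mapping={r"(?i)high": 2, r"(?i)med": 1, r"(?i)low": 0},  # insertion order matters
    unseen_value=-1,
)
s = pl.Series("grade", ["HIGH", "medium", "low-ish", "unknown"])
print(mapper(s).to_list())
# [2, 1, 0, -1]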
ml_tools/PSO_optimization.py
CHANGED
@@ -7,15 +7,27 @@ from sklearn.base import ClassifierMixin
 from typing import Literal, Union, Tuple, Dict, Optional
 import pandas as pd
 from copy import deepcopy
-from .utilities import _script_info, threshold_binary_values, threshold_binary_values_batch, deserialize_object, list_files_by_extension, save_dataframe, make_fullpath
+from .utilities import _script_info, threshold_binary_values, threshold_binary_values_batch, deserialize_object, list_files_by_extension, save_dataframe, make_fullpath, yield_dataframes_from_dir, sanitize_filename
 import torch
 from tqdm import trange
+import logging
+import matplotlib.pyplot as plt
+import seaborn as sns
+from collections import defaultdict
+
+# Configure logger
+logging.basicConfig(
+    level=logging.INFO,
+    format="[%(asctime)s] [%(levelname)s] - %(message)s",
+    datefmt="%Y-%m-%d %H:%M:%S"
+)


 __all__ = [
     "ObjectiveFunction",
     "multiple_objective_functions_from_dir",
-    "run_pso"
+    "run_pso",
+    "plot_optimal_feature_distributions"
 ]

@@ -184,6 +196,52 @@ def _save_results(*dicts, save_dir: Union[str,Path], target_name: str):
     save_dataframe(df=df, save_dir=save_dir, filename=f"Optimization_{target_name}")


+def _run_single_pso(objective_function: ObjectiveFunction, pso_args: dict, feature_names: list[str], target_name: str, random_state: int):
+    """Helper for a single PSO run."""
+    pso_args.update({"seed": random_state})
+
+    best_features, best_target, *_ = _pso(**pso_args)
+
+    # Flip best_target if maximization was used
+    if objective_function.task == "maximization":
+        best_target = -best_target
+
+    # Threshold binary features
+    binary_number = objective_function.binary_features
+    best_features_threshold = threshold_binary_values(best_features, binary_number)
+
+    # Name features and target
+    best_features_named = {name: value for name, value in zip(feature_names, best_features_threshold)}
+    best_target_named = {target_name: best_target}
+
+    return best_features_named, best_target_named
+
+
+def _run_post_hoc_pso(objective_function: ObjectiveFunction, pso_args: dict, feature_names: list[str], target_name: str, repetitions: int):
+    """Helper for post-hoc PSO analysis."""
+    all_best_targets = []
+    all_best_features = [[] for _ in range(len(feature_names))]
+
+    for _ in range(repetitions):
+        best_features, best_target, *_ = _pso(**pso_args)
+
+        if objective_function.task == "maximization":
+            best_target = -best_target
+
+        binary_number = objective_function.binary_features
+        best_features_threshold = threshold_binary_values(best_features, binary_number)
+
+        for i, best_feature in enumerate(best_features_threshold):
+            all_best_features[i].append(best_feature)
+        all_best_targets.append(best_target)
+
+    # Name features and target
+    all_best_features_named = {name: lst for name, lst in zip(feature_names, all_best_features)}
+    all_best_targets_named = {target_name: all_best_targets}
+
+    return all_best_features_named, all_best_targets_named
+
+
 def run_pso(lower_boundaries: list[float],
             upper_boundaries: list[float],
             objective_function: ObjectiveFunction,
@@ -236,6 +294,8 @@ def run_pso(lower_boundaries: list[float],
     -----
     - PSO minimizes the objective function by default; if maximization is desired, it should be handled inside the ObjectiveFunction.
     """
+
+
     # Select device
     if torch.cuda.is_available():
         device = torch.device("cuda")
@@ -243,7 +303,8 @@ def run_pso(lower_boundaries: list[float],
         device = torch.device("mps")
     else:
         device = torch.device("cpu")
-
+
+    logging.info(f"Using device: '{device}'")

     # set local deep copies to prevent in place list modification
     local_lower_boundaries = deepcopy(lower_boundaries)
@@ -271,7 +332,7 @@ def run_pso(lower_boundaries: list[float],
     if target_name is None:
         target_name = "Target"

-
+    pso_arguments = {
         "func":objective_function,
         "lb": lower,
         "ub": upper,
@@ -281,59 +342,17 @@ def run_pso(lower_boundaries: list[float],
         "particle_output": False,
     }

+    # Dispatcher
+    if post_hoc_analysis is None or post_hoc_analysis <= 1:
+        features, target = _run_single_pso(objective_function, pso_arguments, names, target_name, random_state)
+    else:
+        features, target = _run_post_hoc_pso(objective_function, pso_arguments, names, target_name, post_hoc_analysis)
+
+    # --- Save Results ---
     save_results_path = make_fullpath(save_results_dir, make=True)
+    _save_results(features, target, save_dir=save_results_path, target_name=target_name)

-
-        arguments.update({"seed": random_state})
-
-        best_features, best_target, *_ = _pso(**arguments)
-        # best_features, best_target, _particle_positions, _target_values_per_position = _pso(**arguments)
-
-        # flip best_target if maximization was used
-        if objective_function.task == "maximization":
-            best_target = -best_target
-
-        # threshold binary features
-        best_features_threshold = threshold_binary_values(best_features, binary_number)
-
-        # name features
-        best_features_named = {name: value for name, value in zip(names, best_features_threshold)}
-        best_target_named = {target_name: best_target}
-
-        # save results
-        _save_results(best_features_named, best_target_named, save_dir=save_results_path, target_name=target_name)
-
-        return best_features_named, best_target_named
-    else:
-        all_best_targets = list()
-        all_best_features = [[] for _ in range(size_of_features)]
-        for _ in range(post_hoc_analysis):
-            best_features, best_target, *_ = _pso(**arguments)
-            # best_features, best_target, _particle_positions, _target_values_per_position = _pso(**arguments)
-
-            # flip best_target if maximization was used
-            if objective_function.task == "maximization":
-                best_target = -best_target
-
-            # threshold binary features
-            best_features_threshold = threshold_binary_values(best_features, binary_number)
-
-            for i, best_feature in enumerate(best_features_threshold):
-                all_best_features[i].append(best_feature)
-            all_best_targets.append(best_target)
-
-        # name features
-        all_best_features_named = {name: list_values for name, list_values in zip(names, all_best_features)}
-        all_best_targets_named = {target_name: all_best_targets}
-
-        # save results
-        _save_results(all_best_features_named, all_best_targets_named, save_dir=save_results_path, target_name=target_name)
-
-        return all_best_features_named, all_best_targets_named # type: ignore
-
-
-def info():
-    _script_info(__all__)
+    return features, target


 def _pso(func: ObjectiveFunction,
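With the dispatcher in place, one call covers both modes. A hedged usage sketch: boundary values are illustrative, the ObjectiveFunction construction is elided, and any remaining keyword arguments follow the full signature above:

# objective = ObjectiveFunction(...)  # wraps a deserialized model, as elsewhere in this module
best_features, best_target = run_pso(
    lower_boundaries=[0.0, 0.0, 0.0],
    upper_boundaries=[1.0, 5.0, 10.0],
    objective_function=objective,
    save_results_dir="pso_results",
    post_hoc_analysis=None,   # None or <= 1 dispatches to _run_single_pso
)
# post_hoc_analysis=30 would dispatch to _run_post_hoc_pso and save 30 optima per target.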
@@ -342,7 +361,9 @@ def _pso(func: ObjectiveFunction,
          device: torch.device,
          swarmsize: int,
          maxiter: int,
-
+         omega_start = 0.9, # STARTING inertia weight
+         omega_end = 0.4, # ENDING inertia weight
+         # omega = 0.729, # Clerc and Kennedy’s constriction coefficient
          phip = 1.49445, # Clerc and Kennedy’s constriction coefficient
          phig = 1.49445, # Clerc and Kennedy’s constriction coefficient
          tolerance = 1e-8,
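These two parameters feed the linear schedule applied later in the main loop, omega = omega_start - (omega_start - omega_end) * (i / maxiter). With the defaults 0.9 and 0.4 and, say, maxiter=100, omega is 0.9 at i=0, 0.65 at i=50, and 0.405 at i=99, shifting the swarm from exploration toward exploitation as iterations progress.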
@@ -418,7 +439,7 @@ def _pso(func: ObjectiveFunction,

     # Initialize positions and velocities
     r = torch.rand((swarmsize, ndim), device=device, requires_grad=False)
-    positions = lb_t + r * (ub_t - lb_t)
+    positions = lb_t + r * (ub_t - lb_t)
     velocities = torch.zeros_like(positions, requires_grad=False)

     # Initialize best positions and scores
@@ -428,19 +449,17 @@ def _pso(func: ObjectiveFunction,
     global_best_score = float('inf')
     global_best_position = torch.zeros(ndim, device=device, requires_grad=False)

-    # History (optional)
     if particle_output:
         history_positions = []
         history_scores = []

-    # Main loop
     previous_best_score = float('inf')
-    progress = trange(maxiter, desc="PSO", unit="iter", leave=True)
+    progress = trange(maxiter, desc="PSO", unit="iter", leave=True)
     with torch.no_grad():
         for i in progress:
             # Evaluate objective for all particles
-            positions_np = positions.detach().cpu().numpy()
-            scores_np = func(positions_np)
+            positions_np = positions.detach().cpu().numpy()
+            scores_np = func(positions_np)
             scores = torch.tensor(scores_np, device=device, dtype=torch.float32)

             # Update personal bests
@@ -454,17 +473,18 @@ def _pso(func: ObjectiveFunction,
                 global_best_score = min_score.item()
                 global_best_position = personal_best_positions[min_idx].clone()

-            # Early stopping criteria
             if abs(previous_best_score - global_best_score) < tolerance:
                 progress.set_description(f"PSO (early stop at iteration {i+1})")
                 break
             previous_best_score = global_best_score

-            # Optional: track history for debugging/visualization
             if particle_output:
                 history_positions.append(positions.detach().cpu().numpy())
                 history_scores.append(scores_np)
-
+
+            # Linearly decreasing inertia weight
+            omega = omega_start - (omega_start - omega_end) * (i / maxiter)
+
             # Velocity update
             rp = torch.rand((swarmsize, ndim), device=device, requires_grad=False)
             rg = torch.rand((swarmsize, ndim), device=device, requires_grad=False)
@@ -476,11 +496,9 @@ def _pso(func: ObjectiveFunction,
             # Position update
             positions = positions + velocities

-            # Clamp to search space bounds
             positions = torch.max(positions, lb_t)
             positions = torch.min(positions, ub_t)

-    # Move to CPU and convert to NumPy
     best_position = global_best_position.detach().cpu().numpy()
     best_score = global_best_score

@@ -488,3 +506,91 @@ def _pso(func: ObjectiveFunction,
         return best_position, best_score, history_positions, history_scores
     else:
         return best_position, best_score
+
+
+def plot_optimal_feature_distributions(results_dir: Union[str, Path], save_dir: Union[str, Path], color_by_target: bool = True):
+    """
+    Analyzes optimization results and plots the distribution of optimal values for each feature.
+
+    This function can operate in two modes based on the `color_by_target` parameter:
+    1. Aggregate: gathers all values for a feature into a single group and plots one overall distribution (histogram + KDE).
+    2. Color-coded: plots a separate, color-coded Kernel Density Estimate (KDE) for each source target, allowing for direct comparison on a single chart.
+
+    Parameters
+    ----------
+    results_dir : str or Path
+        The path to the directory containing the optimization result CSV files.
+    save_dir : str or Path
+        The directory where the output plots will be saved.
+    color_by_target : bool, optional
+        If True, generates comparative plots with distributions colored by their source target.
+    """
+    mode = "Comparative (color-coded)" if color_by_target else "Aggregate"
+    logging.info(f"Starting analysis in '{mode}' mode from results in: '{results_dir}'")
+
+    output_path = make_fullpath(save_dir, make=True)
+    all_files = list(yield_dataframes_from_dir(results_dir))
+
+    if not all_files:
+        logging.warning("No data found. No plots will be generated.")
+        return
+
+    # --- MODE 1: Color-coded plots by target ---
+    if color_by_target:
+        data_to_plot = []
+        for df, df_name in all_files:
+            # Assumes last col is target, rest are features
+            melted_df = df.iloc[:, :-1].melt(var_name='feature', value_name='value')
+            # Sanitize target name for cleaner legend labels
+            melted_df['target'] = df_name.replace("Optimization_", "")
+            data_to_plot.append(melted_df)
+
+        long_df = pd.concat(data_to_plot, ignore_index=True)
+        features = long_df['feature'].unique()
+        logging.info(f"Found data for {len(features)} features across {len(long_df['target'].unique())} targets. Generating plots...")
+
+        for feature_name in features:
+            plt.figure(figsize=(12, 7))
+            feature_df = long_df[long_df['feature'] == feature_name]
+
+            sns.kdeplot(data=feature_df, x='value', hue='target', fill=True, alpha=0.1)
+
+            plt.title(f"Comparative Distribution for '{feature_name}'", fontsize=16)
+            plt.xlabel("Feature Value", fontsize=12)
+            plt.ylabel("Density", fontsize=12)
+            plt.grid(axis='y', alpha=0.5, linestyle='--')
+            plt.legend(title='Target')
+
+            sanitized_feature_name = sanitize_filename(feature_name)
+            plot_filename = output_path / f"Comparative_{sanitized_feature_name}.svg"
+            plt.savefig(plot_filename, bbox_inches='tight')
+            plt.close()
+
+    # --- MODE 2: Aggregate plot ---
+    else:
+        feature_distributions = defaultdict(list)
+        for df, _ in all_files:
+            feature_columns = df.iloc[:, :-1]
+            for feature_name in feature_columns:
+                feature_distributions[feature_name].extend(df[feature_name].tolist())
+
+        logging.info(f"Found data for {len(feature_distributions)} features. Generating plots...")
+        for feature_name, values in feature_distributions.items():
+            plt.figure(figsize=(12, 7))
+            sns.histplot(x=values, kde=True, bins='auto', stat="density")
+
+            plt.title(f"Aggregate Distribution for '{feature_name}'", fontsize=16)
+            plt.xlabel("Feature Value", fontsize=12)
+            plt.ylabel("Density", fontsize=12)
+            plt.grid(axis='y', alpha=0.5, linestyle='--')
+
+            sanitized_feature_name = sanitize_filename(feature_name)
+            plot_filename = output_path / f"Aggregate_{sanitized_feature_name}.svg"
+            plt.savefig(plot_filename, bbox_inches='tight')
+            plt.close()
+
+    logging.info(f"✅ All plots saved successfully to: {output_path}")
+
+
+def info():
+    _script_info(__all__)
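A short usage sketch tying the new pieces together; directory names are illustrative:

from ml_tools.PSO_optimization import plot_optimal_feature_distributions

# After repeated optimizations, e.g. run_pso(..., post_hoc_analysis=30,
# save_results_dir="pso_results"), each Optimization_<target>.csv holds one row
# per repetition: the optimal feature values plus the achieved target value.
plot_optimal_feature_distributions(
    results_dir="pso_results",
    save_dir="pso_plots",
    color_by_target=True,   # one KDE per target; False aggregates all runs
)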
ml_tools/utilities.py
CHANGED
{dragon_ml_toolbox-2.2.1.dist-info → dragon_ml_toolbox-2.3.0.dist-info}/WHEEL, licenses/LICENSE, licenses/LICENSE-THIRD-PARTY.md, top_level.txt
File without changes