dragon-ml-toolbox 2.2.0__py3-none-any.whl → 2.3.0__py3-none-any.whl

This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.

Potentially problematic release: this version of dragon-ml-toolbox might be problematic.

@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dragon-ml-toolbox
3
- Version: 2.2.0
3
+ Version: 2.3.0
4
4
  Summary: A collection of tools for data science and machine learning projects
5
5
  Author-email: Karl Loza <luigiloza@gmail.com>
6
6
  License-Expression: MIT
@@ -1,21 +1,21 @@
1
- dragon_ml_toolbox-2.2.0.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
2
- dragon_ml_toolbox-2.2.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=6cfpIeQ6D4Mcs10nkogQrkVyq1T7i2qXjjNHFoUMOyE,1892
3
- ml_tools/ETL_engineering.py,sha256=9Lg-anXhggtdzvRPgVVSiAUGu5sb-LAZDfLDFXJlHns,21328
1
+ dragon_ml_toolbox-2.3.0.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
2
+ dragon_ml_toolbox-2.3.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=6cfpIeQ6D4Mcs10nkogQrkVyq1T7i2qXjjNHFoUMOyE,1892
3
+ ml_tools/ETL_engineering.py,sha256=ns8HsLWZhByurvjtUUW10p7If1h1O5-btUfCRXxzkME,31568
4
4
  ml_tools/MICE_imputation.py,sha256=1fovHycZMdZ6OgVh_bk8-r3wGi4rqf6rS10LOEWYaQo,11177
5
- ml_tools/PSO_optimization.py,sha256=T-wnB94DcRWuRd2M3loDVT4POtIP0MOhs-VilAf1L4E,20974
5
+ ml_tools/PSO_optimization.py,sha256=gi56mF-q6BApYwhAd9jix0xiYz595WTPcUh7afZsRJ4,25378
6
6
  ml_tools/VIF_factor.py,sha256=lpM3Z2X_iZfXUWbCbURoeI0Tb196lU0bAsRo7q6AzBM,10235
7
7
  ml_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
8
  ml_tools/_particle_swarm_optimization.py,sha256=b_eNNkA89Y40hj76KauivT8KLScH1B9wF2IXptOqkOw,22220
9
- ml_tools/data_exploration.py,sha256=CDUVRTHfww105IXDRpBQ81KZWx5HXSsA-FVsVYBzNw8,21298
9
+ ml_tools/data_exploration.py,sha256=Fzbz_DKZ7F2e3-JbahLqKr3aP6lt9aCK9rNOHvR7nlA,23665
10
10
  ml_tools/datasetmaster.py,sha256=EFUEX-tqq94Ak1rXXYR-XaX85olrxvF2EuytdzUK7y0,29131
11
11
  ml_tools/ensemble_learning.py,sha256=q9jbu7SupvXz61sURFQ9V2-7gUsLbA3cSgyb2MQFyyc,37351
12
12
  ml_tools/handle_excel.py,sha256=Uasx-DX7RNVQSzGHVJhX7UQ9RgBbX5H1ud1Hw_y8Kp4,12944
13
13
  ml_tools/logger.py,sha256=_k7WJdpFJj3IsjOgvjLJgUFZyF8RK3Jlgp5tAu_dLQU,4767
14
14
  ml_tools/pytorch_models.py,sha256=bpWZsrSwCvHJQkR6UfoPpElsMv9AvmiNErNHC8NYB_I,10132
15
15
  ml_tools/trainer.py,sha256=WAZ4EdrZuTOAnGXRWV3XcLNce4s7EKGf2-qchLC08Ik,15702
16
- ml_tools/utilities.py,sha256=A7Wm1ArpqFG80WKmnkYdtSzIRLvg5x-9nPNidZIbpPA,20671
16
+ ml_tools/utilities.py,sha256=T6AnNEQjUDnMAMSIJ8yZqToAVESIlEKK0bGBEm3sAUU,20670
17
17
  ml_tools/vision_helpers.py,sha256=idQ-Ugp1IdsvwXiYyhYa9G3rTRTm37YRpkQDLEpANHM,7701
18
- dragon_ml_toolbox-2.2.0.dist-info/METADATA,sha256=oTLE1Q6BzsIwicQM7XCumt89XAjHZcV6CxDTfyteP_w,2974
19
- dragon_ml_toolbox-2.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
20
- dragon_ml_toolbox-2.2.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
21
- dragon_ml_toolbox-2.2.0.dist-info/RECORD,,
18
+ dragon_ml_toolbox-2.3.0.dist-info/METADATA,sha256=4wivV_JKPd83xNzf6xzSfCwxiZgvYL5uW4yE6Da8tnU,2974
19
+ dragon_ml_toolbox-2.3.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
20
+ dragon_ml_toolbox-2.3.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
21
+ dragon_ml_toolbox-2.3.0.dist-info/RECORD,,
ml_tools/ETL_engineering.py CHANGED
@@ -2,19 +2,120 @@ import polars as pl
2
2
  import re
3
3
  from typing import Literal, Union, Optional, Any, Callable, List, Dict
4
4
  from .utilities import _script_info
5
+ import pandas as pd
5
6
 
6
7
 
7
8
  __all__ = [
9
+ "ColumnCleaner",
10
+ "DataFrameCleaner"
8
11
  "TransformationRecipe",
9
12
  "DataProcessor",
10
13
  "KeywordDummifier",
11
14
  "NumberExtractor",
12
15
  "MultiNumberExtractor",
16
+ "RatioCalculator"
13
17
  "CategoryMapper",
18
+ "RegexMapper",
14
19
  "ValueBinner",
15
20
  "DateFeatureExtractor"
16
21
  ]
17
22
 
23
+ ########## EXTRACT and CLEAN ##########
24
+
25
+ class ColumnCleaner:
26
+ """
27
+ Cleans and standardizes a single pandas Series based on a dictionary of regex-to-value replacement rules.
28
+
29
+ Args:
30
+ rules (Dict[str, str]):
31
+ A dictionary where each key is a regular expression pattern and
32
+ each value is the standardized string to replace matches with.
33
+ """
34
+ def __init__(self, rules: Dict[str, str]):
35
+ if not isinstance(rules, dict):
36
+ raise TypeError("The 'rules' argument must be a dictionary.")
37
+
38
+ # Validate that all keys are valid regular expressions
39
+ for pattern in rules.keys():
40
+ try:
41
+ re.compile(pattern)
42
+ except re.error as e:
43
+ raise ValueError(f"Invalid regex pattern '{pattern}': {e}") from e
44
+
45
+ self.rules = rules
46
+
47
+ def clean(self, series: pd.Series) -> pd.Series:
48
+ """
49
+ Applies the standardization rules to the provided Series (requires string data).
50
+
51
+ Non-matching values are kept as they are.
52
+
53
+ Args:
54
+ series (pd.Series): The pandas Series to clean.
55
+
56
+ Returns:
57
+ pd.Series: A new Series with the values cleaned and standardized.
58
+ """
59
+ return series.astype(str).replace(self.rules, regex=True)
60
+
61
+
62
+ class DataFrameCleaner:
63
+ """
64
+ Orchestrates the cleaning of multiple columns in a pandas DataFrame using a nested dictionary of rules and `ColumnCleaner` objects.
65
+
66
+ Args:
67
+ rules (Dict[str, Dict[str, str]]):
68
+ A nested dictionary where each top-level key is a column name,
69
+ and its value is a dictionary of regex rules for that column, as expected by `ColumnCleaner`.
70
+ """
71
+ def __init__(self, rules: Dict[str, Dict[str, str]]):
72
+ if not isinstance(rules, dict):
73
+ raise TypeError("The 'rules' argument must be a nested dictionary.")
74
+
75
+ for col_name, col_rules in rules.items():
76
+ if not isinstance(col_rules, dict):
77
+ raise TypeError(
78
+ f"The value for column '{col_name}' must be a dictionary "
79
+ f"of rules, but got type {type(col_rules).__name__}."
80
+ )
81
+
82
+ self.rules = rules
83
+
84
+ def clean(self, df: pd.DataFrame) -> pd.DataFrame:
85
+ """
86
+ Applies all defined cleaning rules to the DataFrame.
87
+
88
+ Args:
89
+ df (pd.DataFrame): The pandas DataFrame to clean.
90
+
91
+ Returns:
92
+ pd.DataFrame: A new, cleaned DataFrame.
93
+ """
94
+ rule_columns = set(self.rules.keys())
95
+ df_columns = set(df.columns)
96
+
97
+ missing_columns = rule_columns - df_columns
98
+
99
+ if missing_columns:
100
+ # Report all missing columns in a single, clear error message
101
+ raise ValueError(
102
+ f"The following columns specified in the cleaning rules "
103
+ f"were not found in the DataFrame: {sorted(list(missing_columns))}"
104
+ )
105
+
106
+ # Start the process
107
+ df_cleaned = df.copy()
108
+
109
+ for column_name, column_rules in self.rules.items():
110
+ # Create and apply the specific cleaner for the column
111
+ cleaner = ColumnCleaner(rules=column_rules)
112
+ df_cleaned[column_name] = cleaner.clean(df_cleaned[column_name])
113
+
114
+ return df_cleaned
115
+
116
+
117
+ ############ TRANSFORM ####################
118
+
18
119
  # Magic word for rename-only transformation
19
120
  _RENAME = "rename"
20
121
 
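The new EXTRACT and CLEAN classes above appear to target raw pandas data ahead of the Polars transform pipeline. A minimal usage sketch; the column names and regex rules are illustrative, not from the package:

import pandas as pd
from ml_tools.ETL_engineering import DataFrameCleaner

raw = pd.DataFrame({
    "pressure": ["10 bar", "12bar", "15"],
    "grade": ["Grade A", "grade-a", "B"],
})

cleaner = DataFrameCleaner(rules={
    "pressure": {r"\s*bar$": ""},              # strip a trailing unit
    "grade": {r"(?i)^grade[\s\-]*a$": "A"},    # collapse spelling variants
})
cleaned = cleaner.clean(raw)  # returns a new DataFrame; a single column can be cleaned with ColumnCleaner directly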
@@ -101,7 +202,7 @@ class DataProcessor:
101
202
  raise TypeError("The recipe must be an instance of TransformationRecipe.")
102
203
  if len(recipe) == 0:
103
204
  raise ValueError("The recipe cannot be empty.")
104
- self.recipe = recipe
205
+ self._recipe = recipe
105
206
 
106
207
  def transform(self, df: pl.DataFrame) -> pl.DataFrame:
107
208
  """
@@ -109,7 +210,7 @@ class DataProcessor:
109
210
  """
110
211
  processed_columns = []
111
212
  # Recipe object is iterable
112
- for step in self.recipe:
213
+ for step in self._recipe:
113
214
  input_col_name = step["input_col"]
114
215
  output_col_spec = step["output_col"]
115
216
  transform_action = step["transform"]
@@ -154,6 +255,49 @@ class DataProcessor:
154
255
  return pl.DataFrame()
155
256
 
156
257
  return pl.DataFrame(processed_columns)
258
+
259
+ def __str__(self) -> str:
260
+ """
261
+ Provides a detailed, human-readable string representation of the
262
+ entire processing pipeline.
263
+ """
264
+ header = "DataProcessor Pipeline"
265
+ divider = "-" * len(header)
266
+ num_steps = len(self._recipe)
267
+
268
+ lines = [
269
+ header,
270
+ divider,
271
+ f"Number of steps: {num_steps}\n"
272
+ ]
273
+
274
+ if num_steps == 0:
275
+ lines.append("No transformation steps defined.")
276
+ return "\n".join(lines)
277
+
278
+ for i, step in enumerate(self._recipe, 1):
279
+ transform_action = step["transform"]
280
+
281
+ # Get a clean name for the transformation action
282
+ if transform_action == _RENAME: # "rename"
283
+ transform_name = "Rename"
284
+ else:
285
+ # This works for both functions and class instances
286
+ transform_name = type(transform_action).__name__
287
+
288
+ lines.append(f"[{i}] Input: '{step['input_col']}'")
289
+ lines.append(f" - Transform: {transform_name}")
290
+ lines.append(f" - Output(s): {step['output_col']}")
291
+ if i < num_steps:
292
+ lines.append("") # Add a blank line between steps
293
+
294
+ return "\n".join(lines)
295
+
296
+ def inspect(self) -> None:
297
+ """
298
+ Prints the detailed string representation of the pipeline to the console.
299
+ """
300
+ print(self)
157
301
 
158
302
 
159
303
  class KeywordDummifier:
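The new __str__ and inspect methods only read the stored recipe, so they can be called on any processor built the usual way. A short sketch (the TransformationRecipe construction API is unchanged and not shown in this hunk):

from ml_tools.ETL_engineering import DataProcessor

processor = DataProcessor(recipe)  # `recipe` is a previously built TransformationRecipe
processor.inspect()                # prints the step-by-step pipeline summary
summary = str(processor)           # same text as a string, e.g. for logging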
@@ -293,8 +437,7 @@ class MultiNumberExtractor:
293
437
  """
294
438
  Extracts multiple numbers from a single polars string column into several new columns.
295
439
 
296
- This transformer is designed for one-to-many mappings, such as parsing
297
- ratios (100:30) or coordinates (10, 25) into separate columns.
440
+ This transformer is designed for one-to-many mappings, such as parsing coordinates (10, 25) into separate columns.
298
441
 
299
442
  Args:
300
443
  num_outputs (int):
@@ -370,6 +513,59 @@ class MultiNumberExtractor:
370
513
  return pl.select(output_expressions)
371
514
 
372
515
 
516
+ class RatioCalculator:
517
+ """
518
+ A transformer that parses a string ratio (e.g., "40:5" or "30/2") and computes the result of the division.
519
+
520
+ Args:
521
+ regex_pattern (str, optional):
522
+ The regex pattern to find the numerator and denominator. It MUST
523
+ contain exactly two capturing groups: the first for the
524
+ numerator and the second for the denominator. Defaults to a
525
+ pattern that handles common delimiters like ':' and '/'.
526
+ """
527
+ def __init__(
528
+ self,
529
+ regex_pattern: str = r"(\d+\.?\d*)\s*[:/]\s*(\d+\.?\d*)"
530
+ ):
531
+ # --- Validation ---
532
+ try:
533
+ if re.compile(regex_pattern).groups != 2:
534
+ raise ValueError(
535
+ "regex_pattern must contain exactly two "
536
+ "capturing groups '(...)'."
537
+ )
538
+ except re.error as e:
539
+ raise ValueError(f"Invalid regex pattern provided: {e}") from e
540
+
541
+ self.regex_pattern = regex_pattern
542
+
543
+ def __call__(self, column: pl.Series) -> pl.Series:
544
+ """
545
+ Applies the ratio calculation logic to the input column.
546
+
547
+ Args:
548
+ column (pl.Series): The input Polars Series of ratio strings.
549
+
550
+ Returns:
551
+ pl.Series: A new Series of floats containing the division result.
552
+ Returns null for invalid formats or division by zero.
553
+ """
554
+ # .extract_groups returns a struct with a field for each capture group
555
+ # e.g., {"group_1": "40", "group_2": "5"}
556
+ groups = column.str.extract_groups(self.regex_pattern)
557
+
558
+ # Extract numerator and denominator, casting to float
559
+ # strict=False ensures that non-matches become null
560
+ numerator = groups.struct.field("group_1").cast(pl.Float64, strict=False)
561
+ denominator = groups.struct.field("group_2").cast(pl.Float64, strict=False)
562
+
563
+ # Safely perform division, returning null if denominator is 0
564
+ return pl.when(denominator != 0).then(
565
+ numerator / denominator
566
+ ).otherwise(None)
567
+
568
+
373
569
  class CategoryMapper:
374
570
  """
375
571
  A transformer that maps string categories to specified numerical values using a dictionary.
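RatioCalculator is a callable transformer like the other pipeline steps. A hedged sketch of a direct call on a standalone Series (the values are illustrative; in normal use the instance is attached to a recipe step instead):

import polars as pl
from ml_tools.ETL_engineering import RatioCalculator

calc = RatioCalculator()                      # default pattern accepts ':' or '/' as the delimiter
mix = pl.Series("mix", ["40:5", "30/2", "no ratio"])
result = calc(mix)                            # numerator / denominator per row; non-matches and zero denominators become null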
@@ -407,7 +603,90 @@ class CategoryMapper:
407
603
  pl.Series: A new Series with categories mapped to numbers.
408
604
  """
409
605
  # Ensure the column is treated as a string for matching keys
410
- return column.cast(pl.Utf8).map_dict(self.mapping, default=self.default_value)
606
+ str_column = column.cast(pl.Utf8)
607
+
608
+ # Create a list of 'when/then' expressions, one for each mapping
609
+ mapping_expressions = [
610
+ pl.when(str_column == from_val).then(pl.lit(to_val))
611
+ for from_val, to_val in self.mapping.items()
612
+ ]
613
+
614
+ # Use coalesce to find the first non-null value.
615
+ # The default_value acts as the final fallback.
616
+ final_expr = pl.coalesce(
617
+ *mapping_expressions, # Unpack the list of expressions
618
+ pl.lit(self.default_value)
619
+ )
620
+
621
+ return pl.select(final_expr).to_series()
622
+
623
+
624
+ class RegexMapper:
625
+ """
626
+ A transformer that maps string categories to numerical values based on a
627
+ dictionary of regular expression patterns.
628
+
629
+ The class iterates through the mapping dictionary in order, and the first
630
+ pattern that matches a given string determines the output value. This
631
+ "first match wins" logic makes the order of the mapping important.
632
+
633
+ Args:
634
+ mapping (Dict[str, Union[int, float]]):
635
+ An ordered dictionary where keys are regex patterns and values are
636
+ the numbers to map to if the pattern is found.
637
+ unseen_value (Optional[Union[int, float]], optional):
638
+ The numerical value to use for strings that do not match any
639
+ of the regex patterns. If None (default), unseen values are
640
+ mapped to null.
641
+ """
642
+ def __init__(
643
+ self,
644
+ mapping: Dict[str, Union[int, float]],
645
+ unseen_value: Optional[Union[int, float]] = None,
646
+ ):
647
+ # --- Validation ---
648
+ if not isinstance(mapping, dict):
649
+ raise TypeError("The 'mapping' argument must be a dictionary.")
650
+
651
+ for pattern, value in mapping.items():
652
+ try:
653
+ re.compile(pattern)
654
+ except re.error as e:
655
+ raise ValueError(f"Invalid regex pattern '{pattern}': {e}") from e
656
+ if not isinstance(value, (int, float)):
657
+ raise TypeError(f"Mapping values must be int or float, but got {type(value)} for pattern '{pattern}'.")
658
+
659
+ self.mapping = mapping
660
+ self.unseen_value = unseen_value
661
+
662
+ def __call__(self, column: pl.Series) -> pl.Series:
663
+ """
664
+ Applies the regex mapping logic to the input column.
665
+
666
+ Args:
667
+ column (pl.Series): The input Polars Series of string data.
668
+
669
+ Returns:
670
+ pl.Series: A new Series with strings mapped to numbers based on
671
+ the first matching regex pattern.
672
+ """
673
+ # Ensure the column is treated as a string for matching
674
+ str_column = column.cast(pl.Utf8)
675
+
676
+ # Build the when/then/otherwise chain from the inside out.
677
+ # Start with the final fallback value for non-matches.
678
+ mapping_expr = pl.lit(self.unseen_value)
679
+
680
+ # Iterate through the mapping in reverse to construct the nested expression
681
+ for pattern, value in reversed(list(self.mapping.items())):
682
+ mapping_expr = (
683
+ pl.when(str_column.str.contains(pattern))
684
+ .then(pl.lit(value))
685
+ .otherwise(mapping_expr)
686
+ )
687
+
688
+ # Execute the complete expression chain and return the resulting Series
689
+ return pl.select(mapping_expr).to_series()
411
690
 
412
691
 
413
692
  class ValueBinner:
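Because RegexMapper applies "first match wins", more specific patterns should come first and broader ones last. A small sketch with illustrative patterns and values:

import polars as pl
from ml_tools.ETL_engineering import RegexMapper

mapper = RegexMapper(
    mapping={
        r"(?i)high": 3,
        r"(?i)medium": 2,
        r"(?i)low": 1,
    },
    unseen_value=0,            # strings matching no pattern map to 0 instead of null
)
levels = pl.Series("risk", ["High risk", "low", "unknown"])
mapped = mapper(levels)        # expected: 3, 1, 0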
ml_tools/PSO_optimization.py CHANGED
@@ -7,15 +7,27 @@ from sklearn.base import ClassifierMixin
7
7
  from typing import Literal, Union, Tuple, Dict, Optional
8
8
  import pandas as pd
9
9
  from copy import deepcopy
10
- from .utilities import _script_info, threshold_binary_values, threshold_binary_values_batch, deserialize_object, list_files_by_extension, save_dataframe, make_fullpath
10
+ from .utilities import _script_info, threshold_binary_values, threshold_binary_values_batch, deserialize_object, list_files_by_extension, save_dataframe, make_fullpath, yield_dataframes_from_dir, sanitize_filename
11
11
  import torch
12
12
  from tqdm import trange
13
+ import logging
14
+ import matplotlib.pyplot as plt
15
+ import seaborn as sns
16
+ from collections import defaultdict
17
+
18
+ # Configure logger
19
+ logging.basicConfig(
20
+ level=logging.INFO,
21
+ format="[%(asctime)s] [%(levelname)s] - %(message)s",
22
+ datefmt="%Y-%m-%d %H:%M:%S"
23
+ )
13
24
 
14
25
 
15
26
  __all__ = [
16
27
  "ObjectiveFunction",
17
28
  "multiple_objective_functions_from_dir",
18
- "run_pso"
29
+ "run_pso",
30
+ "plot_optimal_feature_distributions"
19
31
  ]
20
32
 
21
33
 
@@ -184,6 +196,52 @@ def _save_results(*dicts, save_dir: Union[str,Path], target_name: str):
184
196
  save_dataframe(df=df, save_dir=save_dir, filename=f"Optimization_{target_name}")
185
197
 
186
198
 
199
+ def _run_single_pso(objective_function: ObjectiveFunction, pso_args: dict, feature_names: list[str], target_name: str, random_state: int):
200
+ """Helper for a single PSO run."""
201
+ pso_args.update({"seed": random_state})
202
+
203
+ best_features, best_target, *_ = _pso(**pso_args)
204
+
205
+ # Flip best_target if maximization was used
206
+ if objective_function.task == "maximization":
207
+ best_target = -best_target
208
+
209
+ # Threshold binary features
210
+ binary_number = objective_function.binary_features
211
+ best_features_threshold = threshold_binary_values(best_features, binary_number)
212
+
213
+ # Name features and target
214
+ best_features_named = {name: value for name, value in zip(feature_names, best_features_threshold)}
215
+ best_target_named = {target_name: best_target}
216
+
217
+ return best_features_named, best_target_named
218
+
219
+
220
+ def _run_post_hoc_pso(objective_function: ObjectiveFunction, pso_args: dict, feature_names: list[str], target_name: str, repetitions: int):
221
+ """Helper for post-hoc PSO analysis."""
222
+ all_best_targets = []
223
+ all_best_features = [[] for _ in range(len(feature_names))]
224
+
225
+ for _ in range(repetitions):
226
+ best_features, best_target, *_ = _pso(**pso_args)
227
+
228
+ if objective_function.task == "maximization":
229
+ best_target = -best_target
230
+
231
+ binary_number = objective_function.binary_features
232
+ best_features_threshold = threshold_binary_values(best_features, binary_number)
233
+
234
+ for i, best_feature in enumerate(best_features_threshold):
235
+ all_best_features[i].append(best_feature)
236
+ all_best_targets.append(best_target)
237
+
238
+ # Name features and target
239
+ all_best_features_named = {name: lst for name, lst in zip(feature_names, all_best_features)}
240
+ all_best_targets_named = {target_name: all_best_targets}
241
+
242
+ return all_best_features_named, all_best_targets_named
243
+
244
+
187
245
  def run_pso(lower_boundaries: list[float],
188
246
  upper_boundaries: list[float],
189
247
  objective_function: ObjectiveFunction,
@@ -236,6 +294,8 @@ def run_pso(lower_boundaries: list[float],
236
294
  -----
237
295
  - PSO minimizes the objective function by default; if maximization is desired, it should be handled inside the ObjectiveFunction.
238
296
  """
297
+
298
+
239
299
  # Select device
240
300
  if torch.cuda.is_available():
241
301
  device = torch.device("cuda")
@@ -243,7 +303,8 @@ def run_pso(lower_boundaries: list[float],
243
303
  device = torch.device("mps")
244
304
  else:
245
305
  device = torch.device("cpu")
246
- print(f"[PSO] Using device: '{device}'")
306
+
307
+ logging.info(f"Using device: '{device}'")
247
308
 
248
309
  # set local deep copies to prevent in place list modification
249
310
  local_lower_boundaries = deepcopy(lower_boundaries)
@@ -271,7 +332,7 @@ def run_pso(lower_boundaries: list[float],
271
332
  if target_name is None:
272
333
  target_name = "Target"
273
334
 
274
- arguments = {
335
+ pso_arguments = {
275
336
  "func":objective_function,
276
337
  "lb": lower,
277
338
  "ub": upper,
@@ -281,59 +342,17 @@ def run_pso(lower_boundaries: list[float],
281
342
  "particle_output": False,
282
343
  }
283
344
 
345
+ # Dispatcher
346
+ if post_hoc_analysis is None or post_hoc_analysis <= 1:
347
+ features, target = _run_single_pso(objective_function, pso_arguments, names, target_name, random_state)
348
+ else:
349
+ features, target = _run_post_hoc_pso(objective_function, pso_arguments, names, target_name, post_hoc_analysis)
350
+
351
+ # --- Save Results ---
284
352
  save_results_path = make_fullpath(save_results_dir, make=True)
353
+ _save_results(features, target, save_dir=save_results_path, target_name=target_name)
285
354
 
286
- if post_hoc_analysis is None or post_hoc_analysis == 1:
287
- arguments.update({"seed": random_state})
288
-
289
- best_features, best_target, *_ = _pso(**arguments)
290
- # best_features, best_target, _particle_positions, _target_values_per_position = _pso(**arguments)
291
-
292
- # flip best_target if maximization was used
293
- if objective_function.task == "maximization":
294
- best_target = -best_target
295
-
296
- # threshold binary features
297
- best_features_threshold = threshold_binary_values(best_features, binary_number)
298
-
299
- # name features
300
- best_features_named = {name: value for name, value in zip(names, best_features_threshold)}
301
- best_target_named = {target_name: best_target}
302
-
303
- # save results
304
- _save_results(best_features_named, best_target_named, save_dir=save_results_path, target_name=target_name)
305
-
306
- return best_features_named, best_target_named
307
- else:
308
- all_best_targets = list()
309
- all_best_features = [[] for _ in range(size_of_features)]
310
- for _ in range(post_hoc_analysis):
311
- best_features, best_target, *_ = _pso(**arguments)
312
- # best_features, best_target, _particle_positions, _target_values_per_position = _pso(**arguments)
313
-
314
- # flip best_target if maximization was used
315
- if objective_function.task == "maximization":
316
- best_target = -best_target
317
-
318
- # threshold binary features
319
- best_features_threshold = threshold_binary_values(best_features, binary_number)
320
-
321
- for i, best_feature in enumerate(best_features_threshold):
322
- all_best_features[i].append(best_feature)
323
- all_best_targets.append(best_target)
324
-
325
- # name features
326
- all_best_features_named = {name: list_values for name, list_values in zip(names, all_best_features)}
327
- all_best_targets_named = {target_name: all_best_targets}
328
-
329
- # save results
330
- _save_results(all_best_features_named, all_best_targets_named, save_dir=save_results_path, target_name=target_name)
331
-
332
- return all_best_features_named, all_best_targets_named # type: ignore
333
-
334
-
335
- def info():
336
- _script_info(__all__)
355
+ return features, target
337
356
 
338
357
 
339
358
  def _pso(func: ObjectiveFunction,
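The dispatcher keeps run_pso's public interface; only the single-run and post-hoc code paths moved into the helpers above. A hedged call sketch (boundaries, the directory name and the objective are placeholders; the ObjectiveFunction constructor is not shown in this diff):

from ml_tools.PSO_optimization import run_pso

features, target = run_pso(
    lower_boundaries=[0.0, 0.0, 0.0],
    upper_boundaries=[1.0, 10.0, 5.0],
    objective_function=objective,      # a previously built ObjectiveFunction
    save_results_dir="pso_results",
    post_hoc_analysis=20,              # None or <=1 runs once; >1 repeats and saves distributions
)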
@@ -342,7 +361,9 @@ def _pso(func: ObjectiveFunction,
342
361
  device: torch.device,
343
362
  swarmsize: int,
344
363
  maxiter: int,
345
- omega = 0.729, # Clerc and Kennedy’s constriction coefficient
364
+ omega_start = 0.9, # STARTING inertia weight
365
+ omega_end = 0.4, # ENDING inertia weight
366
+ # omega = 0.729, # Clerc and Kennedy’s constriction coefficient
346
367
  phip = 1.49445, # Clerc and Kennedy’s constriction coefficient
347
368
  phig = 1.49445, # Clerc and Kennedy’s constriction coefficient
348
369
  tolerance = 1e-8,
@@ -418,7 +439,7 @@ def _pso(func: ObjectiveFunction,
418
439
 
419
440
  # Initialize positions and velocities
420
441
  r = torch.rand((swarmsize, ndim), device=device, requires_grad=False)
421
- positions = lb_t + r * (ub_t - lb_t) # shape: (swarmsize, ndim)
442
+ positions = lb_t + r * (ub_t - lb_t)
422
443
  velocities = torch.zeros_like(positions, requires_grad=False)
423
444
 
424
445
  # Initialize best positions and scores
@@ -428,19 +449,17 @@ def _pso(func: ObjectiveFunction,
428
449
  global_best_score = float('inf')
429
450
  global_best_position = torch.zeros(ndim, device=device, requires_grad=False)
430
451
 
431
- # History (optional)
432
452
  if particle_output:
433
453
  history_positions = []
434
454
  history_scores = []
435
455
 
436
- # Main loop
437
456
  previous_best_score = float('inf')
438
- progress = trange(maxiter, desc="PSO", unit="iter", leave=True) #tqdm bar
457
+ progress = trange(maxiter, desc="PSO", unit="iter", leave=True)
439
458
  with torch.no_grad():
440
459
  for i in progress:
441
460
  # Evaluate objective for all particles
442
- positions_np = positions.detach().cpu().numpy() # shape: (swarmsize, n_features)
443
- scores_np = func(positions_np) # shape: (swarmsize,)
461
+ positions_np = positions.detach().cpu().numpy()
462
+ scores_np = func(positions_np)
444
463
  scores = torch.tensor(scores_np, device=device, dtype=torch.float32)
445
464
 
446
465
  # Update personal bests
@@ -454,17 +473,18 @@ def _pso(func: ObjectiveFunction,
454
473
  global_best_score = min_score.item()
455
474
  global_best_position = personal_best_positions[min_idx].clone()
456
475
 
457
- # Early stopping criteria
458
476
  if abs(previous_best_score - global_best_score) < tolerance:
459
477
  progress.set_description(f"PSO (early stop at iteration {i+1})")
460
478
  break
461
479
  previous_best_score = global_best_score
462
480
 
463
- # Optional: track history for debugging/visualization
464
481
  if particle_output:
465
482
  history_positions.append(positions.detach().cpu().numpy())
466
483
  history_scores.append(scores_np)
467
-
484
+
485
+ # Linearly decreasing inertia weight
486
+ omega = omega_start - (omega_start - omega_end) * (i / maxiter)
487
+
468
488
  # Velocity update
469
489
  rp = torch.rand((swarmsize, ndim), device=device, requires_grad=False)
470
490
  rg = torch.rand((swarmsize, ndim), device=device, requires_grad=False)
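The fixed constriction coefficient omega is replaced by a linear inertia schedule: at iteration i the weight is omega_start - (omega_start - omega_end) * (i / maxiter), so early iterations favour exploration and later ones exploitation. A standalone sketch of the schedule:

def inertia_schedule(i: int, maxiter: int, omega_start: float = 0.9, omega_end: float = 0.4) -> float:
    # Linearly decays the inertia weight; omega_end is only approached, since i stays below maxiter
    return omega_start - (omega_start - omega_end) * (i / maxiter)

# maxiter=100: i=0 -> 0.90, i=50 -> 0.65, i=99 -> 0.405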
@@ -476,11 +496,9 @@ def _pso(func: ObjectiveFunction,
476
496
  # Position update
477
497
  positions = positions + velocities
478
498
 
479
- # Clamp to search space bounds
480
499
  positions = torch.max(positions, lb_t)
481
500
  positions = torch.min(positions, ub_t)
482
501
 
483
- # Move to CPU and convert to NumPy
484
502
  best_position = global_best_position.detach().cpu().numpy()
485
503
  best_score = global_best_score
486
504
 
@@ -488,3 +506,91 @@ def _pso(func: ObjectiveFunction,
488
506
  return best_position, best_score, history_positions, history_scores
489
507
  else:
490
508
  return best_position, best_score
509
+
510
+
511
+ def plot_optimal_feature_distributions(results_dir: Union[str, Path], save_dir: Union[str, Path], color_by_target: bool = True):
512
+ """
513
+ Analyzes optimization results and plots the distribution of optimal values for each feature.
514
+
515
+ This function can operate in two modes based on the `color_by_target` parameter:
516
+ 1. Aggregate: pools all values for a feature into a single group and plots one overall distribution (histogram + KDE).
517
+ 2. Color-coded: Plots a separate, color-coded Kernel Density Estimate (KDE) for each source target, allowing for direct comparison on a single chart.
518
+
519
+ Parameters
520
+ ----------
521
+ results_dir : str or Path
522
+ The path to the directory containing the optimization result CSV files.
523
+ save_dir : str or Path
524
+ The directory where the output plots will be saved.
525
+ color_by_target : bool, optional
526
+ If True, generates comparative plots with distributions colored by their source target.
527
+ """
528
+ mode = "Comparative (color-coded)" if color_by_target else "Aggregate"
529
+ logging.info(f"Starting analysis in '{mode}' mode from results in: '{results_dir}'")
530
+
531
+ output_path = make_fullpath(save_dir, make=True)
532
+ all_files = list(yield_dataframes_from_dir(results_dir))
533
+
534
+ if not all_files:
535
+ logging.warning("No data found. No plots will be generated.")
536
+ return
537
+
538
+ # --- MODE 1: Color-coded plots by target ---
539
+ if color_by_target:
540
+ data_to_plot = []
541
+ for df, df_name in all_files:
542
+ # Assumes last col is target, rest are features
543
+ melted_df = df.iloc[:, :-1].melt(var_name='feature', value_name='value')
544
+ # Sanitize target name for cleaner legend labels
545
+ melted_df['target'] = df_name.replace("Optimization_", "")
546
+ data_to_plot.append(melted_df)
547
+
548
+ long_df = pd.concat(data_to_plot, ignore_index=True)
549
+ features = long_df['feature'].unique()
550
+ logging.info(f"Found data for {len(features)} features across {len(long_df['target'].unique())} targets. Generating plots...")
551
+
552
+ for feature_name in features:
553
+ plt.figure(figsize=(12, 7))
554
+ feature_df = long_df[long_df['feature'] == feature_name]
555
+
556
+ sns.kdeplot(data=feature_df, x='value', hue='target', fill=True, alpha=0.1)
557
+
558
+ plt.title(f"Comparative Distribution for '{feature_name}'", fontsize=16)
559
+ plt.xlabel("Feature Value", fontsize=12)
560
+ plt.ylabel("Density", fontsize=12)
561
+ plt.grid(axis='y', alpha=0.5, linestyle='--')
562
+ plt.legend(title='Target')
563
+
564
+ sanitized_feature_name = sanitize_filename(feature_name)
565
+ plot_filename = output_path / f"Comparative_{sanitized_feature_name}.svg"
566
+ plt.savefig(plot_filename, bbox_inches='tight')
567
+ plt.close()
568
+
569
+ # --- MODE 2: Aggregate plot ---
570
+ else:
571
+ feature_distributions = defaultdict(list)
572
+ for df, _ in all_files:
573
+ feature_columns = df.iloc[:, :-1]
574
+ for feature_name in feature_columns:
575
+ feature_distributions[feature_name].extend(df[feature_name].tolist())
576
+
577
+ logging.info(f"Found data for {len(feature_distributions)} features. Generating plots...")
578
+ for feature_name, values in feature_distributions.items():
579
+ plt.figure(figsize=(12, 7))
580
+ sns.histplot(x=values, kde=True, bins='auto', stat="density")
581
+
582
+ plt.title(f"Aggregate Distribution for '{feature_name}'", fontsize=16)
583
+ plt.xlabel("Feature Value", fontsize=12)
584
+ plt.ylabel("Density", fontsize=12)
585
+ plt.grid(axis='y', alpha=0.5, linestyle='--')
586
+
587
+ sanitized_feature_name = sanitize_filename(feature_name)
588
+ plot_filename = output_path / f"Aggregate_{sanitized_feature_name}.svg"
589
+ plt.savefig(plot_filename, bbox_inches='tight')
590
+ plt.close()
591
+
592
+ logging.info(f"✅ All plots saved successfully to: {output_path}")
593
+
594
+
595
+ def info():
596
+ _script_info(__all__)
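A hedged usage sketch for the new plotting helper; it reads the CSV files written by run_pso/_save_results and writes one SVG per feature (paths are placeholders):

from ml_tools.PSO_optimization import plot_optimal_feature_distributions

plot_optimal_feature_distributions(
    results_dir="pso_results",   # directory with the Optimization_<target> result files written by run_pso
    save_dir="pso_plots",
    color_by_target=True,        # False produces a single aggregate histogram + KDE per feature
)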
ml_tools/data_exploration.py CHANGED
@@ -1,4 +1,5 @@
1
1
  import pandas as pd
2
+ from pandas.api.types import is_numeric_dtype
2
3
  import numpy as np
3
4
  import matplotlib.pyplot as plt
4
5
  import seaborn as sns
@@ -24,7 +25,8 @@ __all__ = [
24
25
  "plot_value_distributions",
25
26
  "clip_outliers_single",
26
27
  "clip_outliers_multi",
27
- "match_and_filter_columns_by_regex"
28
+ "match_and_filter_columns_by_regex",
29
+ "standardize_percentages"
28
30
  ]
29
31
 
30
32
 
@@ -575,6 +577,72 @@ def match_and_filter_columns_by_regex(
575
577
  return filtered_df, matched_columns
576
578
 
577
579
 
580
+ def standardize_percentages(
581
+ df: pd.DataFrame,
582
+ columns: list[str],
583
+ treat_one_as_proportion: bool = True,
584
+ round_digits: int = 2
585
+ ) -> pd.DataFrame:
586
+ """
587
+ Standardizes numeric columns containing mixed-format percentages.
588
+
589
+ This function cleans columns where percentages might be entered as whole
590
+ numbers (e.g., 55) or as proportions (e.g., 0.55). It assumes values
591
+ between 0 and 1 are proportions and multiplies them by 100.
592
+
593
+ Args:
594
+ df (pd.DataFrame): The input pandas DataFrame.
595
+ columns (list[str]): A list of column names to standardize.
596
+ treat_one_as_proportion (bool):
597
+ - If True (default): The value `1` is treated as a proportion and converted to `100`.
598
+ - If False: The value `1` is treated as `1%`.
599
+ round_digits (int): The number of decimal places to round the final result to.
600
+
601
+ Returns:
602
+ (pd.DataFrame):
603
+ A new DataFrame with the specified columns cleaned and standardized.
604
+ """
605
+ df_copy = df.copy()
606
+
607
+ if df_copy.empty:
608
+ return df_copy
609
+
610
+ # This helper function contains the core cleaning logic
611
+ def _clean_value(x: float) -> float:
612
+ """Applies the standardization rule to a single value."""
613
+ if pd.isna(x):
614
+ return x
615
+
616
+ # If treat_one_as_proportion is True, the range for proportions is [0, 1]
617
+ if treat_one_as_proportion and 0 <= x <= 1:
618
+ return x * 100
619
+ # If False, the range for proportions is [0, 1) (1 is excluded)
620
+ elif not treat_one_as_proportion and 0 <= x < 1:
621
+ return x * 100
622
+
623
+ # Otherwise, the value is assumed to be a correctly formatted percentage
624
+ return x
625
+
626
+ for col in columns:
627
+ # --- Robustness Checks ---
628
+ if col not in df_copy.columns:
629
+ print(f"Warning: Column '{col}' not found. Skipping.")
630
+ continue
631
+
632
+ if not is_numeric_dtype(df_copy[col]):
633
+ print(f"Warning: Column '{col}' is not numeric. Skipping.")
634
+ continue
635
+
636
+ # --- Applying the Logic ---
637
+ # Apply the cleaning function to every value in the column
638
+ df_copy[col] = df_copy[col].apply(_clean_value)
639
+
640
+ # Round the result
641
+ df_copy[col] = df_copy[col].round(round_digits)
642
+
643
+ return df_copy
644
+
645
+
578
646
  def _is_notebook():
579
647
  return get_ipython() is not None
580
648
 
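A small sketch of the new standardize_percentages helper in data_exploration (the column name and values are illustrative):

import pandas as pd
from ml_tools.data_exploration import standardize_percentages

df = pd.DataFrame({"yield_pct": [0.55, 55.0, 1.0, None]})
out = standardize_percentages(df, columns=["yield_pct"])
# with the defaults, proportions in [0, 1] become 55.0, 55.0, 100.0; NaN is left untouched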
ml_tools/utilities.py CHANGED
@@ -86,7 +86,6 @@ def make_fullpath(
86
86
  return resolved
87
87
 
88
88
 
89
-
90
89
  def list_csv_paths(directory: Union[str,Path]) -> dict[str, Path]:
91
90
  """
92
91
  Lists all `.csv` files in the specified directory and returns a mapping: filenames (without extensions) to their absolute paths.