dragon-ml-toolbox 2.2.1__tar.gz → 2.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dragon-ml-toolbox might be problematic. Click here for more details.

Files changed (27) hide show
  1. {dragon_ml_toolbox-2.2.1/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-2.4.0}/PKG-INFO +20 -6
  2. {dragon_ml_toolbox-2.2.1 → dragon_ml_toolbox-2.4.0}/README.md +17 -5
  3. {dragon_ml_toolbox-2.2.1 → dragon_ml_toolbox-2.4.0/dragon_ml_toolbox.egg-info}/PKG-INFO +20 -6
  4. {dragon_ml_toolbox-2.2.1 → dragon_ml_toolbox-2.4.0}/dragon_ml_toolbox.egg-info/SOURCES.txt +1 -0
  5. {dragon_ml_toolbox-2.2.1 → dragon_ml_toolbox-2.4.0}/dragon_ml_toolbox.egg-info/requires.txt +3 -0
  6. {dragon_ml_toolbox-2.2.1 → dragon_ml_toolbox-2.4.0}/ml_tools/ETL_engineering.py +223 -2
  7. dragon_ml_toolbox-2.4.0/ml_tools/GUI_tools.py +496 -0
  8. {dragon_ml_toolbox-2.2.1 → dragon_ml_toolbox-2.4.0}/ml_tools/PSO_optimization.py +173 -67
  9. {dragon_ml_toolbox-2.2.1 → dragon_ml_toolbox-2.4.0}/ml_tools/utilities.py +0 -1
  10. {dragon_ml_toolbox-2.2.1 → dragon_ml_toolbox-2.4.0}/pyproject.toml +5 -1
  11. {dragon_ml_toolbox-2.2.1 → dragon_ml_toolbox-2.4.0}/LICENSE +0 -0
  12. {dragon_ml_toolbox-2.2.1 → dragon_ml_toolbox-2.4.0}/LICENSE-THIRD-PARTY.md +0 -0
  13. {dragon_ml_toolbox-2.2.1 → dragon_ml_toolbox-2.4.0}/dragon_ml_toolbox.egg-info/dependency_links.txt +0 -0
  14. {dragon_ml_toolbox-2.2.1 → dragon_ml_toolbox-2.4.0}/dragon_ml_toolbox.egg-info/top_level.txt +0 -0
  15. {dragon_ml_toolbox-2.2.1 → dragon_ml_toolbox-2.4.0}/ml_tools/MICE_imputation.py +0 -0
  16. {dragon_ml_toolbox-2.2.1 → dragon_ml_toolbox-2.4.0}/ml_tools/VIF_factor.py +0 -0
  17. {dragon_ml_toolbox-2.2.1 → dragon_ml_toolbox-2.4.0}/ml_tools/__init__.py +0 -0
  18. {dragon_ml_toolbox-2.2.1 → dragon_ml_toolbox-2.4.0}/ml_tools/_particle_swarm_optimization.py +0 -0
  19. {dragon_ml_toolbox-2.2.1 → dragon_ml_toolbox-2.4.0}/ml_tools/data_exploration.py +0 -0
  20. {dragon_ml_toolbox-2.2.1 → dragon_ml_toolbox-2.4.0}/ml_tools/datasetmaster.py +0 -0
  21. {dragon_ml_toolbox-2.2.1 → dragon_ml_toolbox-2.4.0}/ml_tools/ensemble_learning.py +0 -0
  22. {dragon_ml_toolbox-2.2.1 → dragon_ml_toolbox-2.4.0}/ml_tools/handle_excel.py +0 -0
  23. {dragon_ml_toolbox-2.2.1 → dragon_ml_toolbox-2.4.0}/ml_tools/logger.py +0 -0
  24. {dragon_ml_toolbox-2.2.1 → dragon_ml_toolbox-2.4.0}/ml_tools/pytorch_models.py +0 -0
  25. {dragon_ml_toolbox-2.2.1 → dragon_ml_toolbox-2.4.0}/ml_tools/trainer.py +0 -0
  26. {dragon_ml_toolbox-2.2.1 → dragon_ml_toolbox-2.4.0}/ml_tools/vision_helpers.py +0 -0
  27. {dragon_ml_toolbox-2.2.1 → dragon_ml_toolbox-2.4.0}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dragon-ml-toolbox
3
- Version: 2.2.1
3
+ Version: 2.4.0
4
4
  Summary: A collection of tools for data science and machine learning projects
5
5
  Author-email: Karl Loza <luigiloza@gmail.com>
6
6
  License-Expression: MIT
@@ -37,9 +37,11 @@ Requires-Dist: Pillow
37
37
  Provides-Extra: pytorch
38
38
  Requires-Dist: torch; extra == "pytorch"
39
39
  Requires-Dist: torchvision; extra == "pytorch"
40
+ Provides-Extra: gui
41
+ Requires-Dist: FreeSimpleGUI>=5.2; extra == "gui"
40
42
  Dynamic: license-file
41
43
 
42
- # dragon-ml-tools
44
+ # dragon-ml-toolbox
43
45
 
44
46
  A collection of Python utilities for data science and machine learning, structured as a modular package for easy reuse and installation.
45
47
 
@@ -57,7 +59,7 @@ A collection of Python utilities for data science and machine learning, structur
57
59
  Install the latest stable release from PyPI:
58
60
 
59
61
  ```bash
60
- pip install dragon-ml-tools
62
+ pip install dragon-ml-toolbox
61
63
  ```
62
64
 
63
65
  ### Via GitHub (Editable)
@@ -77,16 +79,26 @@ Install from the conda-forge channel:
77
79
  ```bash
78
80
  conda install -c conda-forge dragon-ml-toolbox
79
81
  ```
80
- **Note:** This version is outdated or broken due to dependency incompatibilities.
82
+ **Note:** The conda-forge version is outdated or broken due to dependency incompatibilities. Use PyPI instead.
81
83
 
82
84
  ## Optional dependencies
83
85
 
84
- **PyTorch**, which provides different builds depending on the **platform** and **hardware acceleration** (e.g., CUDA for NVIDIA GPUs on Linux/Windows, or MPS for Apple Silicon on macOS).
86
+ ### FreeSimpleGUI
87
+
88
+ Wrapper library used to build powerful GUIs. Requires the tkinter backend.
89
+
90
+ ```bash
91
+ pip install dragon-ml-toolbox[gui]
92
+ ```
93
+
94
+ ### PyTorch
95
+
96
+ Different builds available depending on the **platform** and **hardware acceleration** (e.g., CUDA for NVIDIA GPUs on Linux/Windows, or MPS for Apple Silicon on macOS).
85
97
 
86
98
  Install the default CPU-only version with
87
99
 
88
100
  ```bash
89
- pip install dragon-ml-tools[pytorch]
101
+ pip install dragon-ml-toolbox[pytorch]
90
102
  ```
91
103
 
92
104
  To make use of GPU acceleration use the official PyTorch installation instructions:
@@ -108,6 +120,8 @@ from ml_tools.logger import custom_logger
108
120
  data_exploration
109
121
  datasetmaster
110
122
  ensemble_learning
123
+ ETL_engineering
124
+ GUI_tools
111
125
  handle_excel
112
126
  logger
113
127
  MICE_imputation
@@ -1,4 +1,4 @@
1
- # dragon-ml-tools
1
+ # dragon-ml-toolbox
2
2
 
3
3
  A collection of Python utilities for data science and machine learning, structured as a modular package for easy reuse and installation.
4
4
 
@@ -16,7 +16,7 @@ A collection of Python utilities for data science and machine learning, structur
16
16
  Install the latest stable release from PyPI:
17
17
 
18
18
  ```bash
19
- pip install dragon-ml-tools
19
+ pip install dragon-ml-toolbox
20
20
  ```
21
21
 
22
22
  ### Via GitHub (Editable)
@@ -36,16 +36,26 @@ Install from the conda-forge channel:
36
36
  ```bash
37
37
  conda install -c conda-forge dragon-ml-toolbox
38
38
  ```
39
- **Note:** This version is outdated or broken due to dependency incompatibilities.
39
+ **Note:** The conda-forge version is outdated or broken due to dependency incompatibilities. Use PyPI instead.
40
40
 
41
41
  ## Optional dependencies
42
42
 
43
- **PyTorch**, which provides different builds depending on the **platform** and **hardware acceleration** (e.g., CUDA for NVIDIA GPUs on Linux/Windows, or MPS for Apple Silicon on macOS).
43
+ ### FreeSimpleGUI
44
+
45
+ Wrapper library used to build powerful GUIs. Requires the tkinter backend.
46
+
47
+ ```bash
48
+ pip install dragon-ml-toolbox[gui]
49
+ ```
50
+
51
+ ### PyTorch
52
+
53
+ Different builds available depending on the **platform** and **hardware acceleration** (e.g., CUDA for NVIDIA GPUs on Linux/Windows, or MPS for Apple Silicon on macOS).
44
54
 
45
55
  Install the default CPU-only version with
46
56
 
47
57
  ```bash
48
- pip install dragon-ml-tools[pytorch]
58
+ pip install dragon-ml-toolbox[pytorch]
49
59
  ```
50
60
 
51
61
  To make use of GPU acceleration use the official PyTorch installation instructions:
@@ -67,6 +77,8 @@ from ml_tools.logger import custom_logger
67
77
  data_exploration
68
78
  datasetmaster
69
79
  ensemble_learning
80
+ ETL_engineering
81
+ GUI_tools
70
82
  handle_excel
71
83
  logger
72
84
  MICE_imputation
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dragon-ml-toolbox
3
- Version: 2.2.1
3
+ Version: 2.4.0
4
4
  Summary: A collection of tools for data science and machine learning projects
5
5
  Author-email: Karl Loza <luigiloza@gmail.com>
6
6
  License-Expression: MIT
@@ -37,9 +37,11 @@ Requires-Dist: Pillow
37
37
  Provides-Extra: pytorch
38
38
  Requires-Dist: torch; extra == "pytorch"
39
39
  Requires-Dist: torchvision; extra == "pytorch"
40
+ Provides-Extra: gui
41
+ Requires-Dist: FreeSimpleGUI>=5.2; extra == "gui"
40
42
  Dynamic: license-file
41
43
 
42
- # dragon-ml-tools
44
+ # dragon-ml-toolbox
43
45
 
44
46
  A collection of Python utilities for data science and machine learning, structured as a modular package for easy reuse and installation.
45
47
 
@@ -57,7 +59,7 @@ A collection of Python utilities for data science and machine learning, structur
57
59
  Install the latest stable release from PyPI:
58
60
 
59
61
  ```bash
60
- pip install dragon-ml-tools
62
+ pip install dragon-ml-toolbox
61
63
  ```
62
64
 
63
65
  ### Via GitHub (Editable)
@@ -77,16 +79,26 @@ Install from the conda-forge channel:
77
79
  ```bash
78
80
  conda install -c conda-forge dragon-ml-toolbox
79
81
  ```
80
- **Note:** This version is outdated or broken due to dependency incompatibilities.
82
+ **Note:** The conda-forge version is outdated or broken due to dependency incompatibilities. Use PyPI instead.
81
83
 
82
84
  ## Optional dependencies
83
85
 
84
- **PyTorch**, which provides different builds depending on the **platform** and **hardware acceleration** (e.g., CUDA for NVIDIA GPUs on Linux/Windows, or MPS for Apple Silicon on macOS).
86
+ ### FreeSimpleGUI
87
+
88
+ Wrapper library used to build powerful GUIs. Requires the tkinter backend.
89
+
90
+ ```bash
91
+ pip install dragon-ml-toolbox[gui]
92
+ ```
93
+
94
+ ### PyTorch
95
+
96
+ Different builds available depending on the **platform** and **hardware acceleration** (e.g., CUDA for NVIDIA GPUs on Linux/Windows, or MPS for Apple Silicon on macOS).
85
97
 
86
98
  Install the default CPU-only version with
87
99
 
88
100
  ```bash
89
- pip install dragon-ml-tools[pytorch]
101
+ pip install dragon-ml-toolbox[pytorch]
90
102
  ```
91
103
 
92
104
  To make use of GPU acceleration use the official PyTorch installation instructions:
@@ -108,6 +120,8 @@ from ml_tools.logger import custom_logger
108
120
  data_exploration
109
121
  datasetmaster
110
122
  ensemble_learning
123
+ ETL_engineering
124
+ GUI_tools
111
125
  handle_excel
112
126
  logger
113
127
  MICE_imputation
@@ -8,6 +8,7 @@ dragon_ml_toolbox.egg-info/dependency_links.txt
8
8
  dragon_ml_toolbox.egg-info/requires.txt
9
9
  dragon_ml_toolbox.egg-info/top_level.txt
10
10
  ml_tools/ETL_engineering.py
11
+ ml_tools/GUI_tools.py
11
12
  ml_tools/MICE_imputation.py
12
13
  ml_tools/PSO_optimization.py
13
14
  ml_tools/VIF_factor.py
@@ -21,6 +21,9 @@ shap
21
21
  tqdm>=4.0
22
22
  Pillow
23
23
 
24
+ [gui]
25
+ FreeSimpleGUI>=5.2
26
+
24
27
  [pytorch]
25
28
  torch
26
29
  torchvision
@@ -2,19 +2,120 @@ import polars as pl
2
2
  import re
3
3
  from typing import Literal, Union, Optional, Any, Callable, List, Dict
4
4
  from .utilities import _script_info
5
+ import pandas as pd
5
6
 
6
7
 
7
8
# Public API of this module.
# NOTE: every entry needs its own trailing comma. Python silently concatenates
# adjacent string literals, so a missing comma would export one bogus merged
# name (e.g. "DataFrameCleanerTransformationRecipe") instead of two real ones.
__all__ = [
    "ColumnCleaner",
    "DataFrameCleaner",
    "TransformationRecipe",
    "DataProcessor",
    "KeywordDummifier",
    "NumberExtractor",
    "MultiNumberExtractor",
    "RatioCalculator",
    "CategoryMapper",
    "RegexMapper",
    "ValueBinner",
    "DateFeatureExtractor",
]
17
22
 
23
+ ########## EXTRACT and CLEAN ##########
24
+
25
class ColumnCleaner:
    """
    Cleans and standardizes a single pandas Series based on a dictionary of regex-to-value replacement rules.

    Args:
        rules (Dict[str, str]):
            A dictionary where each key is a regular expression pattern and
            each value is the standardized string to replace matches with.

    Raises:
        TypeError: If `rules` is not a dictionary.
        ValueError: If any key is not a valid regular expression.
    """
    def __init__(self, rules: Dict[str, str]):
        if not isinstance(rules, dict):
            raise TypeError("The 'rules' argument must be a dictionary.")

        # Fail fast: reject invalid patterns at construction time instead of
        # in the middle of a cleaning run.
        for pattern in rules:
            try:
                re.compile(pattern)
            except re.error as e:
                raise ValueError(f"Invalid regex pattern '{pattern}': {e}") from e

        self.rules = rules

    def clean(self, series: pd.Series) -> pd.Series:
        """
        Applies the standardization rules to the provided Series.

        Values are converted to strings before the rules are applied.
        Non-matching values are kept as they are (in their string form), and
        missing values stay missing instead of being turned into the literal
        text "nan" / "None".

        Args:
            series (pd.Series): The pandas Series to clean.

        Returns:
            pd.Series: A new Series with the values cleaned and standardized.
        """
        cleaned = series.astype(str).replace(self.rules, regex=True)
        # `astype(str)` stringifies missing values; mask them back out so a
        # null never becomes a "nan" string (and never matches a rule).
        return cleaned.where(series.notna())
60
+
61
+
62
class DataFrameCleaner:
    """
    Orchestrates the cleaning of multiple columns in a pandas DataFrame using a nested dictionary of rules and `ColumnCleaner` objects.

    Args:
        rules (Dict[str, Dict[str, str]]):
            A nested dictionary where each top-level key is a column name,
            and its value is a dictionary of regex rules for that column, as expected by `ColumnCleaner`.

    Raises:
        TypeError: If `rules` is not a dictionary, or if the value for any
            column is not a dictionary.
        ValueError: If any rule key is not a valid regular expression.
    """
    def __init__(self, rules: Dict[str, Dict[str, str]]):
        if not isinstance(rules, dict):
            raise TypeError("The 'rules' argument must be a nested dictionary.")

        for col_name, col_rules in rules.items():
            if not isinstance(col_rules, dict):
                raise TypeError(
                    f"The value for column '{col_name}' must be a dictionary "
                    f"of rules, but got type {type(col_rules).__name__}."
                )
            # Validate patterns up front so a bad regex is reported when the
            # cleaner is built, not halfway through `clean()`.
            for pattern in col_rules:
                try:
                    re.compile(pattern)
                except re.error as e:
                    raise ValueError(
                        f"Invalid regex pattern '{pattern}' for column '{col_name}': {e}"
                    ) from e

        self.rules = rules

    def clean(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Applies all defined cleaning rules to the DataFrame.

        The input DataFrame is not modified; the rules are applied to a copy.

        Args:
            df (pd.DataFrame): The pandas DataFrame to clean.

        Returns:
            pd.DataFrame: A new, cleaned DataFrame.

        Raises:
            ValueError: If any column named in the rules is absent from `df`.
        """
        missing_columns = set(self.rules) - set(df.columns)

        if missing_columns:
            # Report all missing columns in a single, clear error message
            raise ValueError(
                f"The following columns specified in the cleaning rules "
                f"were not found in the DataFrame: {sorted(missing_columns)}"
            )

        df_cleaned = df.copy()

        for column_name, column_rules in self.rules.items():
            # Create and apply the specific cleaner for the column
            cleaner = ColumnCleaner(rules=column_rules)
            df_cleaned[column_name] = cleaner.clean(df_cleaned[column_name])

        return df_cleaned
115
+
116
+
117
+ ############ TRANSFORM ####################
118
+
18
119
  # Magic word for rename-only transformation
19
120
  _RENAME = "rename"
20
121
 
@@ -336,8 +437,7 @@ class MultiNumberExtractor:
336
437
  """
337
438
  Extracts multiple numbers from a single polars string column into several new columns.
338
439
 
339
- This transformer is designed for one-to-many mappings, such as parsing
340
- ratios (100:30) or coordinates (10, 25) into separate columns.
440
+ This transformer is designed for one-to-many mappings, such as parsing coordinates (10, 25) into separate columns.
341
441
 
342
442
  Args:
343
443
  num_outputs (int):
@@ -413,6 +513,59 @@ class MultiNumberExtractor:
413
513
  return pl.select(output_expressions)
414
514
 
415
515
 
516
class RatioCalculator:
    """
    A transformer that parses a string ratio (e.g., "40:5" or "30/2") and computes the result of the division.

    Args:
        regex_pattern (str, optional):
            The regex pattern to find the numerator and denominator. It MUST
            contain exactly two capturing groups: the first for the
            numerator and the second for the denominator. Defaults to a
            pattern that handles common delimiters like ':' and '/'.

    Raises:
        ValueError: If `regex_pattern` is not a valid regular expression or
            does not contain exactly two capturing groups.
    """
    def __init__(
        self,
        regex_pattern: str = r"(\d+\.?\d*)\s*[:/]\s*(\d+\.?\d*)"
    ):
        # --- Validation ---
        try:
            compiled = re.compile(regex_pattern)
        except re.error as e:
            raise ValueError(f"Invalid regex pattern provided: {e}") from e

        if compiled.groups != 2:
            raise ValueError(
                "regex_pattern must contain exactly two "
                "capturing groups '(...)'."
            )

        self.regex_pattern = regex_pattern

    def __call__(self, column: pl.Series) -> pl.Series:
        """
        Applies the ratio calculation logic to the input column.

        Args:
            column (pl.Series): The input Polars Series of ratio strings.

        Returns:
            pl.Series: A new Series of floats containing the division result,
            named after the input column. Contains null for invalid formats
            or division by zero.
        """
        # `extract_groups` yields a struct with one field per capture group.
        # Field names depend on the pattern (polars names unnamed groups by
        # position: "1", "2"; named groups keep their name), so the fields are
        # picked by position rather than by a hard-coded name.
        captured = column.str.extract_groups(self.regex_pattern).struct.unnest()
        numerator_name, denominator_name = captured.columns

        # strict=False turns anything that cannot be parsed into null.
        numerator = pl.col(numerator_name).cast(pl.Float64, strict=False)
        denominator = pl.col(denominator_name).cast(pl.Float64, strict=False)

        # Null out zero denominators (a null denominator also ends up in the
        # `otherwise` branch, so non-matches stay null).
        ratio = (
            pl.when(denominator != 0)
            .then(numerator / denominator)
            .otherwise(None)
            .alias(column.name)
        )

        # Evaluate the expression so that a Series, not a lazy expression, is
        # returned — as the signature promises.
        return captured.select(ratio).to_series()
567
+
568
+
416
569
  class CategoryMapper:
417
570
  """
418
571
  A transformer that maps string categories to specified numerical values using a dictionary.
@@ -468,6 +621,74 @@ class CategoryMapper:
468
621
  return pl.select(final_expr).to_series()
469
622
 
470
623
 
624
class RegexMapper:
    """
    A transformer that converts strings to numbers by testing them against a
    dictionary of regular expression patterns.

    Patterns are tried in the dictionary's order and the first one that
    matches decides the output value, so the order of the mapping matters
    ("first match wins").

    Args:
        mapping (Dict[str, Union[int, float]]):
            An ordered dictionary whose keys are regex patterns and whose
            values are the numbers assigned when the pattern is found.
        unseen_value (Optional[Union[int, float]], optional):
            The number assigned to strings that match none of the patterns.
            If None (default), such strings are mapped to null.

    Raises:
        TypeError: If `mapping` is not a dictionary or one of its values is
            neither an int nor a float.
        ValueError: If one of the keys is not a valid regular expression.
    """
    def __init__(
        self,
        mapping: Dict[str, Union[int, float]],
        unseen_value: Optional[Union[int, float]] = None,
    ):
        # --- Validation ---
        if not isinstance(mapping, dict):
            raise TypeError("The 'mapping' argument must be a dictionary.")

        for regex, number in mapping.items():
            # Each key must compile as a regex ...
            try:
                re.compile(regex)
            except re.error as e:
                raise ValueError(f"Invalid regex pattern '{regex}': {e}") from e
            # ... and each value must be numeric.
            if isinstance(number, (int, float)):
                continue
            raise TypeError(f"Mapping values must be int or float, but got {type(number)} for pattern '{regex}'.")

        self.mapping = mapping
        self.unseen_value = unseen_value

    def __call__(self, column: pl.Series) -> pl.Series:
        """
        Applies the regex mapping logic to the input column.

        Args:
            column (pl.Series): The input Polars Series of string data.

        Returns:
            pl.Series: A new Series with strings mapped to numbers based on
            the first matching regex pattern.
        """
        # Match against a string view of the data.
        text = column.cast(pl.Utf8)

        # The innermost branch of the chain is the fallback for non-matches.
        chain = pl.lit(self.unseen_value)

        # Wrap the chain from the last rule to the first, so that the first
        # rule ends up outermost and therefore takes precedence.
        ordered_rules = list(self.mapping.items())
        for regex, number in ordered_rules[::-1]:
            is_match = text.str.contains(regex)
            chain = pl.when(is_match).then(pl.lit(number)).otherwise(chain)

        # Evaluate the finished expression and hand back the resulting Series.
        return pl.select(chain).to_series()
+
691
+
471
692
  class ValueBinner:
472
693
  """
473
694
  A transformer that discretizes a continuous numerical column into a finite number of bins.