dragon-ml-toolbox 2.2.1__tar.gz → 2.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dragon-ml-toolbox might be problematic. Click here for more details.
- {dragon_ml_toolbox-2.2.1/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-2.4.0}/PKG-INFO +20 -6
- {dragon_ml_toolbox-2.2.1 → dragon_ml_toolbox-2.4.0}/README.md +17 -5
- {dragon_ml_toolbox-2.2.1 → dragon_ml_toolbox-2.4.0/dragon_ml_toolbox.egg-info}/PKG-INFO +20 -6
- {dragon_ml_toolbox-2.2.1 → dragon_ml_toolbox-2.4.0}/dragon_ml_toolbox.egg-info/SOURCES.txt +1 -0
- {dragon_ml_toolbox-2.2.1 → dragon_ml_toolbox-2.4.0}/dragon_ml_toolbox.egg-info/requires.txt +3 -0
- {dragon_ml_toolbox-2.2.1 → dragon_ml_toolbox-2.4.0}/ml_tools/ETL_engineering.py +223 -2
- dragon_ml_toolbox-2.4.0/ml_tools/GUI_tools.py +496 -0
- {dragon_ml_toolbox-2.2.1 → dragon_ml_toolbox-2.4.0}/ml_tools/PSO_optimization.py +173 -67
- {dragon_ml_toolbox-2.2.1 → dragon_ml_toolbox-2.4.0}/ml_tools/utilities.py +0 -1
- {dragon_ml_toolbox-2.2.1 → dragon_ml_toolbox-2.4.0}/pyproject.toml +5 -1
- {dragon_ml_toolbox-2.2.1 → dragon_ml_toolbox-2.4.0}/LICENSE +0 -0
- {dragon_ml_toolbox-2.2.1 → dragon_ml_toolbox-2.4.0}/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-2.2.1 → dragon_ml_toolbox-2.4.0}/dragon_ml_toolbox.egg-info/dependency_links.txt +0 -0
- {dragon_ml_toolbox-2.2.1 → dragon_ml_toolbox-2.4.0}/dragon_ml_toolbox.egg-info/top_level.txt +0 -0
- {dragon_ml_toolbox-2.2.1 → dragon_ml_toolbox-2.4.0}/ml_tools/MICE_imputation.py +0 -0
- {dragon_ml_toolbox-2.2.1 → dragon_ml_toolbox-2.4.0}/ml_tools/VIF_factor.py +0 -0
- {dragon_ml_toolbox-2.2.1 → dragon_ml_toolbox-2.4.0}/ml_tools/__init__.py +0 -0
- {dragon_ml_toolbox-2.2.1 → dragon_ml_toolbox-2.4.0}/ml_tools/_particle_swarm_optimization.py +0 -0
- {dragon_ml_toolbox-2.2.1 → dragon_ml_toolbox-2.4.0}/ml_tools/data_exploration.py +0 -0
- {dragon_ml_toolbox-2.2.1 → dragon_ml_toolbox-2.4.0}/ml_tools/datasetmaster.py +0 -0
- {dragon_ml_toolbox-2.2.1 → dragon_ml_toolbox-2.4.0}/ml_tools/ensemble_learning.py +0 -0
- {dragon_ml_toolbox-2.2.1 → dragon_ml_toolbox-2.4.0}/ml_tools/handle_excel.py +0 -0
- {dragon_ml_toolbox-2.2.1 → dragon_ml_toolbox-2.4.0}/ml_tools/logger.py +0 -0
- {dragon_ml_toolbox-2.2.1 → dragon_ml_toolbox-2.4.0}/ml_tools/pytorch_models.py +0 -0
- {dragon_ml_toolbox-2.2.1 → dragon_ml_toolbox-2.4.0}/ml_tools/trainer.py +0 -0
- {dragon_ml_toolbox-2.2.1 → dragon_ml_toolbox-2.4.0}/ml_tools/vision_helpers.py +0 -0
- {dragon_ml_toolbox-2.2.1 → dragon_ml_toolbox-2.4.0}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dragon-ml-toolbox
|
|
3
|
-
Version: 2.2.1
|
|
3
|
+
Version: 2.4.0
|
|
4
4
|
Summary: A collection of tools for data science and machine learning projects
|
|
5
5
|
Author-email: Karl Loza <luigiloza@gmail.com>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -37,9 +37,11 @@ Requires-Dist: Pillow
|
|
|
37
37
|
Provides-Extra: pytorch
|
|
38
38
|
Requires-Dist: torch; extra == "pytorch"
|
|
39
39
|
Requires-Dist: torchvision; extra == "pytorch"
|
|
40
|
+
Provides-Extra: gui
|
|
41
|
+
Requires-Dist: FreeSimpleGUI>=5.2; extra == "gui"
|
|
40
42
|
Dynamic: license-file
|
|
41
43
|
|
|
42
|
-
# dragon-ml-
|
|
44
|
+
# dragon-ml-toolbox
|
|
43
45
|
|
|
44
46
|
A collection of Python utilities for data science and machine learning, structured as a modular package for easy reuse and installation.
|
|
45
47
|
|
|
@@ -57,7 +59,7 @@ A collection of Python utilities for data science and machine learning, structur
|
|
|
57
59
|
Install the latest stable release from PyPI:
|
|
58
60
|
|
|
59
61
|
```bash
|
|
60
|
-
pip install dragon-ml-
|
|
62
|
+
pip install dragon-ml-toolbox
|
|
61
63
|
```
|
|
62
64
|
|
|
63
65
|
### Via GitHub (Editable)
|
|
@@ -77,16 +79,26 @@ Install from the conda-forge channel:
|
|
|
77
79
|
```bash
|
|
78
80
|
conda install -c conda-forge dragon-ml-toolbox
|
|
79
81
|
```
|
|
80
|
-
**Note:** This version is outdated or broken due to dependency incompatibilities.
|
|
82
|
+
**Note:** This version is outdated or broken due to dependency incompatibilities. Use PyPi instead.
|
|
81
83
|
|
|
82
84
|
## Optional dependencies
|
|
83
85
|
|
|
84
|
-
|
|
86
|
+
### FreeSimpleGUI
|
|
87
|
+
|
|
88
|
+
Wrapper library used to build powerful GUIs. Requires the tkinter backend.
|
|
89
|
+
|
|
90
|
+
```bash
|
|
91
|
+
pip install dragon-ml-toolbox[gui]
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
### PyTorch
|
|
95
|
+
|
|
96
|
+
Different builds available depending on the **platform** and **hardware acceleration** (e.g., CUDA for NVIDIA GPUs on Linux/Windows, or MPS for Apple Silicon on macOS).
|
|
85
97
|
|
|
86
98
|
Install the default CPU-only version with
|
|
87
99
|
|
|
88
100
|
```bash
|
|
89
|
-
pip install dragon-ml-
|
|
101
|
+
pip install dragon-ml-toolbox[pytorch]
|
|
90
102
|
```
|
|
91
103
|
|
|
92
104
|
To make use of GPU acceleration use the official PyTorch installation instructions:
|
|
@@ -108,6 +120,8 @@ from ml_tools.logger import custom_logger
|
|
|
108
120
|
data_exploration
|
|
109
121
|
datasetmaster
|
|
110
122
|
ensemble_learning
|
|
123
|
+
ETL_engineering
|
|
124
|
+
GUI_tools
|
|
111
125
|
handle_excel
|
|
112
126
|
logger
|
|
113
127
|
MICE_imputation
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
# dragon-ml-
|
|
1
|
+
# dragon-ml-toolbox
|
|
2
2
|
|
|
3
3
|
A collection of Python utilities for data science and machine learning, structured as a modular package for easy reuse and installation.
|
|
4
4
|
|
|
@@ -16,7 +16,7 @@ A collection of Python utilities for data science and machine learning, structur
|
|
|
16
16
|
Install the latest stable release from PyPI:
|
|
17
17
|
|
|
18
18
|
```bash
|
|
19
|
-
pip install dragon-ml-
|
|
19
|
+
pip install dragon-ml-toolbox
|
|
20
20
|
```
|
|
21
21
|
|
|
22
22
|
### Via GitHub (Editable)
|
|
@@ -36,16 +36,26 @@ Install from the conda-forge channel:
|
|
|
36
36
|
```bash
|
|
37
37
|
conda install -c conda-forge dragon-ml-toolbox
|
|
38
38
|
```
|
|
39
|
-
**Note:** This version is outdated or broken due to dependency incompatibilities.
|
|
39
|
+
**Note:** This version is outdated or broken due to dependency incompatibilities. Use PyPi instead.
|
|
40
40
|
|
|
41
41
|
## Optional dependencies
|
|
42
42
|
|
|
43
|
-
|
|
43
|
+
### FreeSimpleGUI
|
|
44
|
+
|
|
45
|
+
Wrapper library used to build powerful GUIs. Requires the tkinter backend.
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
pip install dragon-ml-toolbox[gui]
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
### PyTorch
|
|
52
|
+
|
|
53
|
+
Different builds available depending on the **platform** and **hardware acceleration** (e.g., CUDA for NVIDIA GPUs on Linux/Windows, or MPS for Apple Silicon on macOS).
|
|
44
54
|
|
|
45
55
|
Install the default CPU-only version with
|
|
46
56
|
|
|
47
57
|
```bash
|
|
48
|
-
pip install dragon-ml-
|
|
58
|
+
pip install dragon-ml-toolbox[pytorch]
|
|
49
59
|
```
|
|
50
60
|
|
|
51
61
|
To make use of GPU acceleration use the official PyTorch installation instructions:
|
|
@@ -67,6 +77,8 @@ from ml_tools.logger import custom_logger
|
|
|
67
77
|
data_exploration
|
|
68
78
|
datasetmaster
|
|
69
79
|
ensemble_learning
|
|
80
|
+
ETL_engineering
|
|
81
|
+
GUI_tools
|
|
70
82
|
handle_excel
|
|
71
83
|
logger
|
|
72
84
|
MICE_imputation
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dragon-ml-toolbox
|
|
3
|
-
Version: 2.2.1
|
|
3
|
+
Version: 2.4.0
|
|
4
4
|
Summary: A collection of tools for data science and machine learning projects
|
|
5
5
|
Author-email: Karl Loza <luigiloza@gmail.com>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -37,9 +37,11 @@ Requires-Dist: Pillow
|
|
|
37
37
|
Provides-Extra: pytorch
|
|
38
38
|
Requires-Dist: torch; extra == "pytorch"
|
|
39
39
|
Requires-Dist: torchvision; extra == "pytorch"
|
|
40
|
+
Provides-Extra: gui
|
|
41
|
+
Requires-Dist: FreeSimpleGUI>=5.2; extra == "gui"
|
|
40
42
|
Dynamic: license-file
|
|
41
43
|
|
|
42
|
-
# dragon-ml-
|
|
44
|
+
# dragon-ml-toolbox
|
|
43
45
|
|
|
44
46
|
A collection of Python utilities for data science and machine learning, structured as a modular package for easy reuse and installation.
|
|
45
47
|
|
|
@@ -57,7 +59,7 @@ A collection of Python utilities for data science and machine learning, structur
|
|
|
57
59
|
Install the latest stable release from PyPI:
|
|
58
60
|
|
|
59
61
|
```bash
|
|
60
|
-
pip install dragon-ml-
|
|
62
|
+
pip install dragon-ml-toolbox
|
|
61
63
|
```
|
|
62
64
|
|
|
63
65
|
### Via GitHub (Editable)
|
|
@@ -77,16 +79,26 @@ Install from the conda-forge channel:
|
|
|
77
79
|
```bash
|
|
78
80
|
conda install -c conda-forge dragon-ml-toolbox
|
|
79
81
|
```
|
|
80
|
-
**Note:** This version is outdated or broken due to dependency incompatibilities.
|
|
82
|
+
**Note:** This version is outdated or broken due to dependency incompatibilities. Use PyPi instead.
|
|
81
83
|
|
|
82
84
|
## Optional dependencies
|
|
83
85
|
|
|
84
|
-
|
|
86
|
+
### FreeSimpleGUI
|
|
87
|
+
|
|
88
|
+
Wrapper library used to build powerful GUIs. Requires the tkinter backend.
|
|
89
|
+
|
|
90
|
+
```bash
|
|
91
|
+
pip install dragon-ml-toolbox[gui]
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
### PyTorch
|
|
95
|
+
|
|
96
|
+
Different builds available depending on the **platform** and **hardware acceleration** (e.g., CUDA for NVIDIA GPUs on Linux/Windows, or MPS for Apple Silicon on macOS).
|
|
85
97
|
|
|
86
98
|
Install the default CPU-only version with
|
|
87
99
|
|
|
88
100
|
```bash
|
|
89
|
-
pip install dragon-ml-
|
|
101
|
+
pip install dragon-ml-toolbox[pytorch]
|
|
90
102
|
```
|
|
91
103
|
|
|
92
104
|
To make use of GPU acceleration use the official PyTorch installation instructions:
|
|
@@ -108,6 +120,8 @@ from ml_tools.logger import custom_logger
|
|
|
108
120
|
data_exploration
|
|
109
121
|
datasetmaster
|
|
110
122
|
ensemble_learning
|
|
123
|
+
ETL_engineering
|
|
124
|
+
GUI_tools
|
|
111
125
|
handle_excel
|
|
112
126
|
logger
|
|
113
127
|
MICE_imputation
|
|
@@ -8,6 +8,7 @@ dragon_ml_toolbox.egg-info/dependency_links.txt
|
|
|
8
8
|
dragon_ml_toolbox.egg-info/requires.txt
|
|
9
9
|
dragon_ml_toolbox.egg-info/top_level.txt
|
|
10
10
|
ml_tools/ETL_engineering.py
|
|
11
|
+
ml_tools/GUI_tools.py
|
|
11
12
|
ml_tools/MICE_imputation.py
|
|
12
13
|
ml_tools/PSO_optimization.py
|
|
13
14
|
ml_tools/VIF_factor.py
|
|
@@ -2,19 +2,120 @@ import polars as pl
|
|
|
2
2
|
import re
|
|
3
3
|
from typing import Literal, Union, Optional, Any, Callable, List, Dict
|
|
4
4
|
from .utilities import _script_info
|
|
5
|
+
import pandas as pd
|
|
5
6
|
|
|
6
7
|
|
|
7
8
|
# Public API of this module. Every entry needs its own trailing comma:
# adjacent string literals without one are silently concatenated by Python,
# which would export a single non-existent name instead of two real ones.
__all__ = [
    "ColumnCleaner",
    "DataFrameCleaner",
    "TransformationRecipe",
    "DataProcessor",
    "KeywordDummifier",
    "NumberExtractor",
    "MultiNumberExtractor",
    "RatioCalculator",
    "CategoryMapper",
    "RegexMapper",
    "ValueBinner",
    "DateFeatureExtractor",
]
|
|
17
22
|
|
|
23
|
+
########## EXTRACT and CLEAN ##########
|
|
24
|
+
|
|
25
|
+
class ColumnCleaner:
    """
    Cleans and standardizes a single pandas Series based on a dictionary of regex-to-value replacement rules.

    Args:
        rules (Dict[str, str]):
            A dictionary where each key is a regular expression pattern and
            each value is the standardized string to replace matches with.

    Raises:
        TypeError: If `rules` is not a dictionary.
        ValueError: If any key is not a valid regular expression.
    """
    def __init__(self, rules: Dict[str, str]):
        if not isinstance(rules, dict):
            raise TypeError("The 'rules' argument must be a dictionary.")

        # Validate that all keys are valid regular expressions
        for pattern in rules:
            try:
                re.compile(pattern)
            except re.error as e:
                raise ValueError(f"Invalid regex pattern '{pattern}': {e}") from e

        # Store a snapshot rather than the caller's dict: entries added to the
        # original dict after construction would otherwise skip the validation above.
        self.rules = dict(rules)

    def clean(self, series: pd.Series) -> pd.Series:
        """
        Applies the standardization rules to the provided Series.

        Every element is first converted with `astype(str)`, so the rules are
        matched against the string form of non-string values as well.
        Non-matching values are kept as they are (in their string form).

        NOTE(review): because of the string conversion, missing values are
        presumably returned as text (e.g. "nan") rather than as nulls; confirm
        this is the intended behavior for the pandas version in use.

        Args:
            series (pd.Series): The pandas Series to clean.

        Returns:
            pd.Series: A new Series with the values cleaned and standardized.
        """
        as_text = series.astype(str)
        return as_text.replace(self.rules, regex=True)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class DataFrameCleaner:
    """
    Orchestrates the cleaning of multiple columns in a pandas DataFrame using a nested dictionary of rules and `ColumnCleaner` objects.

    Args:
        rules (Dict[str, Dict[str, str]]):
            A nested dictionary where each top-level key is a column name,
            and its value is a dictionary of regex rules for that column, as expected by `ColumnCleaner`.

    Raises:
        TypeError: If `rules`, or the value stored for any column, is not a dictionary.
        ValueError: If the rules for any column are rejected by `ColumnCleaner`
            (for example an invalid regular expression).
    """
    def __init__(self, rules: Dict[str, Dict[str, str]]):
        if not isinstance(rules, dict):
            raise TypeError("The 'rules' argument must be a nested dictionary.")

        for col_name, col_rules in rules.items():
            if not isinstance(col_rules, dict):
                raise TypeError(
                    f"The value for column '{col_name}' must be a dictionary "
                    f"of rules, but got type {type(col_rules).__name__}."
                )
            # Fail fast: building a throwaway cleaner runs its validation now,
            # instead of letting a bad pattern surface in the middle of clean().
            ColumnCleaner(rules=col_rules)

        self.rules = rules

    def clean(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Applies all defined cleaning rules to the DataFrame.

        The input is not modified; the rules are applied to a copy. Columns
        without rules are returned unchanged.

        Args:
            df (pd.DataFrame): The pandas DataFrame to clean.

        Returns:
            pd.DataFrame: A new, cleaned DataFrame.

        Raises:
            ValueError: If any column named in the rules is absent from `df`.
        """
        missing_columns = set(self.rules) - set(df.columns)

        if missing_columns:
            # Report all missing columns in a single, clear error message
            raise ValueError(
                "The following columns specified in the cleaning rules "
                f"were not found in the DataFrame: {sorted(missing_columns)}"
            )

        # Start the process
        df_cleaned = df.copy()

        for column_name, column_rules in self.rules.items():
            # Create and apply the specific cleaner for the column
            cleaner = ColumnCleaner(rules=column_rules)
            df_cleaned[column_name] = cleaner.clean(df_cleaned[column_name])

        return df_cleaned
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
############ TRANSFORM ####################
|
|
118
|
+
|
|
18
119
|
# Magic word for rename-only transformation
|
|
19
120
|
_RENAME = "rename"
|
|
20
121
|
|
|
@@ -336,8 +437,7 @@ class MultiNumberExtractor:
|
|
|
336
437
|
"""
|
|
337
438
|
Extracts multiple numbers from a single polars string column into several new columns.
|
|
338
439
|
|
|
339
|
-
This transformer is designed for one-to-many mappings, such as parsing
|
|
340
|
-
ratios (100:30) or coordinates (10, 25) into separate columns.
|
|
440
|
+
This transformer is designed for one-to-many mappings, such as parsing coordinates (10, 25) into separate columns.
|
|
341
441
|
|
|
342
442
|
Args:
|
|
343
443
|
num_outputs (int):
|
|
@@ -413,6 +513,59 @@ class MultiNumberExtractor:
|
|
|
413
513
|
return pl.select(output_expressions)
|
|
414
514
|
|
|
415
515
|
|
|
516
|
+
class RatioCalculator:
    """
    A transformer that parses a string ratio (e.g., "40:5" or "30/2") and computes the result of the division.

    Args:
        regex_pattern (str, optional):
            The regex pattern to find the numerator and denominator. It MUST
            contain exactly two capturing groups: the first for the
            numerator and the second for the denominator. Defaults to a
            pattern that handles common delimiters like ':' and '/'.

    Raises:
        ValueError: If `regex_pattern` is not a valid regular expression or
            does not contain exactly two capturing groups.
    """
    def __init__(
        self,
        regex_pattern: str = r"(\d+\.?\d*)\s*[:/]\s*(\d+\.?\d*)"
    ):
        # --- Validation ---
        try:
            if re.compile(regex_pattern).groups != 2:
                raise ValueError(
                    "regex_pattern must contain exactly two "
                    "capturing groups '(...)'."
                )
        except re.error as e:
            raise ValueError(f"Invalid regex pattern provided: {e}") from e

        self.regex_pattern = regex_pattern

    def __call__(self, column: pl.Series) -> pl.Series:
        """
        Applies the ratio calculation logic to the input column.

        Args:
            column (pl.Series): The input Polars Series of ratio strings.

        Returns:
            pl.Series: A new Series of floats containing the division result.
                Returns null for invalid formats or division by zero.
        """
        # Pull each capture group by position. This avoids relying on the
        # field names of the struct produced by `extract_groups`, which are
        # "1"/"2" for unnamed groups rather than "group_1"/"group_2".
        # strict=False ensures that values that cannot be parsed become null.
        numerator = column.str.extract(self.regex_pattern, 1).cast(pl.Float64, strict=False)
        denominator = column.str.extract(self.regex_pattern, 2).cast(pl.Float64, strict=False)

        # Safely perform division, returning null if denominator is 0.
        # A null denominator (no match) also falls through to the null branch.
        ratio_expr = pl.when(denominator != 0).then(
            numerator / denominator
        ).otherwise(None)

        # pl.when(...) only builds a lazy expression; evaluate it so that the
        # method really returns a Series, as annotated.
        return pl.select(ratio_expr).to_series()
|
|
567
|
+
|
|
568
|
+
|
|
416
569
|
class CategoryMapper:
|
|
417
570
|
"""
|
|
418
571
|
A transformer that maps string categories to specified numerical values using a dictionary.
|
|
@@ -468,6 +621,74 @@ class CategoryMapper:
|
|
|
468
621
|
return pl.select(final_expr).to_series()
|
|
469
622
|
|
|
470
623
|
|
|
624
|
+
class RegexMapper:
    """
    A transformer that maps string categories to numerical values based on a
    dictionary of regular expression patterns.

    The class iterates through the mapping dictionary in order, and the first
    pattern that matches a given string determines the output value. This
    "first match wins" logic makes the order of the mapping important.

    NOTE(review): patterns are validated with Python's `re` but matched by
    polars' `str.contains`; a pattern accepted here may presumably still be
    rejected by polars at call time. Verify against the polars regex syntax.

    Args:
        mapping (Dict[str, Union[int, float]]):
            An ordered dictionary where keys are regex patterns and values are
            the numbers to map to if the pattern is found.
        unseen_value (Optional[Union[int, float]], optional):
            The numerical value to use for strings that do not match any
            of the regex patterns. If None (default), unseen values are
            mapped to null.

    Raises:
        TypeError: If `mapping` is not a dictionary or a mapped value is not int or float.
        ValueError: If any key is not a valid regular expression.
    """
    def __init__(
        self,
        mapping: Dict[str, Union[int, float]],
        unseen_value: Optional[Union[int, float]] = None,
    ):
        # --- Validation ---
        if not isinstance(mapping, dict):
            raise TypeError("The 'mapping' argument must be a dictionary.")

        for pattern, value in mapping.items():
            try:
                re.compile(pattern)
            except re.error as e:
                raise ValueError(f"Invalid regex pattern '{pattern}': {e}") from e
            if not isinstance(value, (int, float)):
                raise TypeError(f"Mapping values must be int or float, but got {type(value)} for pattern '{pattern}'.")

        self.mapping = mapping
        self.unseen_value = unseen_value

    def __call__(self, column: pl.Series) -> pl.Series:
        """
        Applies the regex mapping logic to the input column.

        Args:
            column (pl.Series): The input Polars Series of string data.

        Returns:
            pl.Series: A new Series with strings mapped to numbers based on
                the first matching regex pattern.
        """
        # With no patterns the expression below would be a bare literal, which
        # evaluates to a single row regardless of the input length. Return the
        # fallback value once per input row instead.
        # The name mirrors what a literal expression is presumably called by
        # polars in the non-empty path; verify if the output name matters.
        if not self.mapping:
            return pl.Series("literal", [self.unseen_value] * len(column))

        # Ensure the column is treated as a string for matching
        str_column = column.cast(pl.Utf8)

        # Build the when/then/otherwise chain from the inside out.
        # Start with the final fallback value for non-matches.
        mapping_expr = pl.lit(self.unseen_value)

        # Iterate through the mapping in reverse so that the first dictionary
        # entry ends up as the outermost (first evaluated) condition.
        for pattern, value in reversed(list(self.mapping.items())):
            mapping_expr = (
                pl.when(str_column.str.contains(pattern))
                .then(pl.lit(value))
                .otherwise(mapping_expr)
            )

        # Execute the complete expression chain and return the resulting Series
        return pl.select(mapping_expr).to_series()
|
|
690
|
+
|
|
691
|
+
|
|
471
692
|
class ValueBinner:
|
|
472
693
|
"""
|
|
473
694
|
A transformer that discretizes a continuous numerical column into a finite number of bins.
|