dragon-ml-toolbox 10.14.0__tar.gz → 11.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dragon-ml-toolbox might be problematic. Click here for more details.
- {dragon_ml_toolbox-10.14.0/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-11.0.0}/PKG-INFO +6 -1
- {dragon_ml_toolbox-10.14.0 → dragon_ml_toolbox-11.0.0}/README.md +5 -0
- {dragon_ml_toolbox-10.14.0 → dragon_ml_toolbox-11.0.0/dragon_ml_toolbox.egg-info}/PKG-INFO +6 -1
- {dragon_ml_toolbox-10.14.0 → dragon_ml_toolbox-11.0.0}/dragon_ml_toolbox.egg-info/SOURCES.txt +1 -0
- {dragon_ml_toolbox-10.14.0 → dragon_ml_toolbox-11.0.0}/ml_tools/ETL_cleaning.py +21 -8
- {dragon_ml_toolbox-10.14.0 → dragon_ml_toolbox-11.0.0}/ml_tools/ETL_engineering.py +124 -23
- {dragon_ml_toolbox-10.14.0 → dragon_ml_toolbox-11.0.0}/ml_tools/ML_optimization.py +103 -3
- dragon_ml_toolbox-11.0.0/ml_tools/constants.py +79 -0
- {dragon_ml_toolbox-10.14.0 → dragon_ml_toolbox-11.0.0}/pyproject.toml +1 -1
- {dragon_ml_toolbox-10.14.0 → dragon_ml_toolbox-11.0.0}/LICENSE +0 -0
- {dragon_ml_toolbox-10.14.0 → dragon_ml_toolbox-11.0.0}/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-10.14.0 → dragon_ml_toolbox-11.0.0}/dragon_ml_toolbox.egg-info/dependency_links.txt +0 -0
- {dragon_ml_toolbox-10.14.0 → dragon_ml_toolbox-11.0.0}/dragon_ml_toolbox.egg-info/requires.txt +0 -0
- {dragon_ml_toolbox-10.14.0 → dragon_ml_toolbox-11.0.0}/dragon_ml_toolbox.egg-info/top_level.txt +0 -0
- {dragon_ml_toolbox-10.14.0 → dragon_ml_toolbox-11.0.0}/ml_tools/GUI_tools.py +0 -0
- {dragon_ml_toolbox-10.14.0 → dragon_ml_toolbox-11.0.0}/ml_tools/MICE_imputation.py +0 -0
- {dragon_ml_toolbox-10.14.0 → dragon_ml_toolbox-11.0.0}/ml_tools/ML_callbacks.py +0 -0
- {dragon_ml_toolbox-10.14.0 → dragon_ml_toolbox-11.0.0}/ml_tools/ML_datasetmaster.py +0 -0
- {dragon_ml_toolbox-10.14.0 → dragon_ml_toolbox-11.0.0}/ml_tools/ML_evaluation.py +0 -0
- {dragon_ml_toolbox-10.14.0 → dragon_ml_toolbox-11.0.0}/ml_tools/ML_evaluation_multi.py +0 -0
- {dragon_ml_toolbox-10.14.0 → dragon_ml_toolbox-11.0.0}/ml_tools/ML_inference.py +0 -0
- {dragon_ml_toolbox-10.14.0 → dragon_ml_toolbox-11.0.0}/ml_tools/ML_models.py +0 -0
- {dragon_ml_toolbox-10.14.0 → dragon_ml_toolbox-11.0.0}/ml_tools/ML_scaler.py +0 -0
- {dragon_ml_toolbox-10.14.0 → dragon_ml_toolbox-11.0.0}/ml_tools/ML_trainer.py +0 -0
- {dragon_ml_toolbox-10.14.0 → dragon_ml_toolbox-11.0.0}/ml_tools/PSO_optimization.py +0 -0
- {dragon_ml_toolbox-10.14.0 → dragon_ml_toolbox-11.0.0}/ml_tools/RNN_forecast.py +0 -0
- {dragon_ml_toolbox-10.14.0 → dragon_ml_toolbox-11.0.0}/ml_tools/SQL.py +0 -0
- {dragon_ml_toolbox-10.14.0 → dragon_ml_toolbox-11.0.0}/ml_tools/VIF_factor.py +0 -0
- {dragon_ml_toolbox-10.14.0 → dragon_ml_toolbox-11.0.0}/ml_tools/__init__.py +0 -0
- {dragon_ml_toolbox-10.14.0 → dragon_ml_toolbox-11.0.0}/ml_tools/_logger.py +0 -0
- {dragon_ml_toolbox-10.14.0 → dragon_ml_toolbox-11.0.0}/ml_tools/_script_info.py +0 -0
- {dragon_ml_toolbox-10.14.0 → dragon_ml_toolbox-11.0.0}/ml_tools/custom_logger.py +0 -0
- {dragon_ml_toolbox-10.14.0 → dragon_ml_toolbox-11.0.0}/ml_tools/data_exploration.py +0 -0
- {dragon_ml_toolbox-10.14.0 → dragon_ml_toolbox-11.0.0}/ml_tools/ensemble_evaluation.py +0 -0
- {dragon_ml_toolbox-10.14.0 → dragon_ml_toolbox-11.0.0}/ml_tools/ensemble_inference.py +0 -0
- {dragon_ml_toolbox-10.14.0 → dragon_ml_toolbox-11.0.0}/ml_tools/ensemble_learning.py +0 -0
- {dragon_ml_toolbox-10.14.0 → dragon_ml_toolbox-11.0.0}/ml_tools/handle_excel.py +0 -0
- {dragon_ml_toolbox-10.14.0 → dragon_ml_toolbox-11.0.0}/ml_tools/keys.py +0 -0
- {dragon_ml_toolbox-10.14.0 → dragon_ml_toolbox-11.0.0}/ml_tools/optimization_tools.py +0 -0
- {dragon_ml_toolbox-10.14.0 → dragon_ml_toolbox-11.0.0}/ml_tools/path_manager.py +0 -0
- {dragon_ml_toolbox-10.14.0 → dragon_ml_toolbox-11.0.0}/ml_tools/utilities.py +0 -0
- {dragon_ml_toolbox-10.14.0 → dragon_ml_toolbox-11.0.0}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dragon-ml-toolbox
|
|
3
|
-
Version: 10.14.0
|
|
3
|
+
Version: 11.0.0
|
|
4
4
|
Summary: A collection of tools for data science and machine learning projects.
|
|
5
5
|
Author-email: Karl Loza <luigiloza@gmail.com>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -139,6 +139,7 @@ pip install "dragon-ml-toolbox[pytorch]"
|
|
|
139
139
|
#### Modules:
|
|
140
140
|
|
|
141
141
|
```bash
|
|
142
|
+
constants
|
|
142
143
|
custom_logger
|
|
143
144
|
data_exploration
|
|
144
145
|
ensemble_evaluation
|
|
@@ -176,6 +177,7 @@ pip install "dragon-ml-toolbox[mice]"
|
|
|
176
177
|
#### Modules:
|
|
177
178
|
|
|
178
179
|
```Bash
|
|
180
|
+
constants
|
|
179
181
|
custom_logger
|
|
180
182
|
MICE_imputation
|
|
181
183
|
VIF_factor
|
|
@@ -196,6 +198,7 @@ pip install "dragon-ml-toolbox[excel]"
|
|
|
196
198
|
#### Modules:
|
|
197
199
|
|
|
198
200
|
```Bash
|
|
201
|
+
constants
|
|
199
202
|
custom_logger
|
|
200
203
|
handle_excel
|
|
201
204
|
path_manager
|
|
@@ -218,6 +221,7 @@ pip install "dragon-ml-toolbox[gui-boost,plot]"
|
|
|
218
221
|
#### Modules:
|
|
219
222
|
|
|
220
223
|
```Bash
|
|
224
|
+
constants
|
|
221
225
|
custom_logger
|
|
222
226
|
GUI_tools
|
|
223
227
|
ensemble_inference
|
|
@@ -241,6 +245,7 @@ pip install "dragon-ml-toolbox[gui-torch,plot]"
|
|
|
241
245
|
#### Modules:
|
|
242
246
|
|
|
243
247
|
```Bash
|
|
248
|
+
constants
|
|
244
249
|
custom_logger
|
|
245
250
|
GUI_tools
|
|
246
251
|
ML_models
|
|
@@ -58,6 +58,7 @@ pip install "dragon-ml-toolbox[pytorch]"
|
|
|
58
58
|
#### Modules:
|
|
59
59
|
|
|
60
60
|
```bash
|
|
61
|
+
constants
|
|
61
62
|
custom_logger
|
|
62
63
|
data_exploration
|
|
63
64
|
ensemble_evaluation
|
|
@@ -95,6 +96,7 @@ pip install "dragon-ml-toolbox[mice]"
|
|
|
95
96
|
#### Modules:
|
|
96
97
|
|
|
97
98
|
```Bash
|
|
99
|
+
constants
|
|
98
100
|
custom_logger
|
|
99
101
|
MICE_imputation
|
|
100
102
|
VIF_factor
|
|
@@ -115,6 +117,7 @@ pip install "dragon-ml-toolbox[excel]"
|
|
|
115
117
|
#### Modules:
|
|
116
118
|
|
|
117
119
|
```Bash
|
|
120
|
+
constants
|
|
118
121
|
custom_logger
|
|
119
122
|
handle_excel
|
|
120
123
|
path_manager
|
|
@@ -137,6 +140,7 @@ pip install "dragon-ml-toolbox[gui-boost,plot]"
|
|
|
137
140
|
#### Modules:
|
|
138
141
|
|
|
139
142
|
```Bash
|
|
143
|
+
constants
|
|
140
144
|
custom_logger
|
|
141
145
|
GUI_tools
|
|
142
146
|
ensemble_inference
|
|
@@ -160,6 +164,7 @@ pip install "dragon-ml-toolbox[gui-torch,plot]"
|
|
|
160
164
|
#### Modules:
|
|
161
165
|
|
|
162
166
|
```Bash
|
|
167
|
+
constants
|
|
163
168
|
custom_logger
|
|
164
169
|
GUI_tools
|
|
165
170
|
ML_models
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dragon-ml-toolbox
|
|
3
|
-
Version: 10.14.0
|
|
3
|
+
Version: 11.0.0
|
|
4
4
|
Summary: A collection of tools for data science and machine learning projects.
|
|
5
5
|
Author-email: Karl Loza <luigiloza@gmail.com>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -139,6 +139,7 @@ pip install "dragon-ml-toolbox[pytorch]"
|
|
|
139
139
|
#### Modules:
|
|
140
140
|
|
|
141
141
|
```bash
|
|
142
|
+
constants
|
|
142
143
|
custom_logger
|
|
143
144
|
data_exploration
|
|
144
145
|
ensemble_evaluation
|
|
@@ -176,6 +177,7 @@ pip install "dragon-ml-toolbox[mice]"
|
|
|
176
177
|
#### Modules:
|
|
177
178
|
|
|
178
179
|
```Bash
|
|
180
|
+
constants
|
|
179
181
|
custom_logger
|
|
180
182
|
MICE_imputation
|
|
181
183
|
VIF_factor
|
|
@@ -196,6 +198,7 @@ pip install "dragon-ml-toolbox[excel]"
|
|
|
196
198
|
#### Modules:
|
|
197
199
|
|
|
198
200
|
```Bash
|
|
201
|
+
constants
|
|
199
202
|
custom_logger
|
|
200
203
|
handle_excel
|
|
201
204
|
path_manager
|
|
@@ -218,6 +221,7 @@ pip install "dragon-ml-toolbox[gui-boost,plot]"
|
|
|
218
221
|
#### Modules:
|
|
219
222
|
|
|
220
223
|
```Bash
|
|
224
|
+
constants
|
|
221
225
|
custom_logger
|
|
222
226
|
GUI_tools
|
|
223
227
|
ensemble_inference
|
|
@@ -241,6 +245,7 @@ pip install "dragon-ml-toolbox[gui-torch,plot]"
|
|
|
241
245
|
#### Modules:
|
|
242
246
|
|
|
243
247
|
```Bash
|
|
248
|
+
constants
|
|
244
249
|
custom_logger
|
|
245
250
|
GUI_tools
|
|
246
251
|
ML_models
|
|
@@ -19,20 +19,26 @@ __all__ = [
|
|
|
19
19
|
|
|
20
20
|
|
|
21
21
|
################ Unique Values per column #################
|
|
22
|
-
def save_unique_values(csv_path: Union[str, Path],
|
|
22
|
+
def save_unique_values(csv_path: Union[str, Path],
|
|
23
|
+
output_dir: Union[str, Path],
|
|
24
|
+
verbose: bool=False,
|
|
25
|
+
keep_column_order: bool = False) -> None:
|
|
23
26
|
"""
|
|
24
27
|
Loads a CSV file, then analyzes it and saves the unique non-null values
|
|
25
28
|
from each column into a separate text file exactly as they appear.
|
|
26
29
|
|
|
27
30
|
This is useful for understanding the raw categories or range of values
|
|
28
|
-
within a dataset before cleaning.
|
|
31
|
+
within a dataset before and after cleaning.
|
|
29
32
|
|
|
30
33
|
Args:
|
|
31
|
-
csv_path (
|
|
34
|
+
csv_path (str | Path):
|
|
32
35
|
The file path to the input CSV file.
|
|
33
|
-
output_dir (
|
|
36
|
+
output_dir (str | Path):
|
|
34
37
|
The path to the directory where the .txt files will be saved.
|
|
35
38
|
The directory will be created if it does not exist.
|
|
39
|
+
keep_column_order (bool):
|
|
40
|
+
If True, prepends a numeric prefix (e.g., '1_', '2_') to each
|
|
41
|
+
output filename to maintain the original column order.
|
|
36
42
|
"""
|
|
37
43
|
# --- 1. Input Validation ---
|
|
38
44
|
csv_path = make_fullpath(input_path=csv_path, enforce="file")
|
|
@@ -74,7 +80,12 @@ def save_unique_values(csv_path: Union[str, Path], output_dir: Union[str, Path],
|
|
|
74
80
|
sanitized_name = sanitize_filename(column_name)
|
|
75
81
|
if not sanitized_name.strip('_'):
|
|
76
82
|
sanitized_name = f'column_{i}'
|
|
77
|
-
|
|
83
|
+
|
|
84
|
+
# --- create filename prefix ---
|
|
85
|
+
# If keep_column_order is True, create a prefix like "1_", "2_", etc.
|
|
86
|
+
prefix = f"{i + 1}_" if keep_column_order else ''
|
|
87
|
+
|
|
88
|
+
file_path = output_dir / f"{prefix}{sanitized_name}_unique_values.txt"
|
|
78
89
|
|
|
79
90
|
# --- Write to file ---
|
|
80
91
|
try:
|
|
@@ -126,9 +137,10 @@ def _cleaner_core(df_in: pl.DataFrame, all_lowercase: bool) -> pl.DataFrame:
|
|
|
126
137
|
's': 's', 't': 't', 'u': 'u', 'v': 'v', 'w': 'w', 'x': 'x',
|
|
127
138
|
'y': 'y', 'z': 'z',
|
|
128
139
|
# Punctuation
|
|
129
|
-
'》': '>', '《': '<', ':': ':', '。': '.', ';': ';', '【': '[', '】': ']',
|
|
140
|
+
'》': '>', '《': '<', ':': ':', '。': '.', ';': ';', '【': '[', '】': ']', '∼': '~',
|
|
130
141
|
'(': '(', ')': ')', '?': '?', '!': '!', '~': '~', '@': '@', '#': '#', '+': '+', '-': '-',
|
|
131
|
-
'$': '$', '%': '%', '^': '^', '&': '&', '*': '*', '\': '-', '|': '|', '≈':'=', '·': '
|
|
142
|
+
'$': '$', '%': '%', '^': '^', '&': '&', '*': '*', '\': '-', '|': '|', '≈':'=', '·': '', '⋅': '',
|
|
143
|
+
'¯': '-',
|
|
132
144
|
|
|
133
145
|
# Commas (avoid commas in entries)
|
|
134
146
|
',': ';',
|
|
@@ -136,6 +148,8 @@ def _cleaner_core(df_in: pl.DataFrame, all_lowercase: bool) -> pl.DataFrame:
|
|
|
136
148
|
'、':';',
|
|
137
149
|
|
|
138
150
|
# Others
|
|
151
|
+
'σ': '',
|
|
152
|
+
'□': '',
|
|
139
153
|
'©': '',
|
|
140
154
|
'®': '',
|
|
141
155
|
'™': '',
|
|
@@ -143,7 +157,6 @@ def _cleaner_core(df_in: pl.DataFrame, all_lowercase: bool) -> pl.DataFrame:
|
|
|
143
157
|
|
|
144
158
|
# Replace special characters in entries
|
|
145
159
|
r'\\': '_',
|
|
146
|
-
# '/': '_', # keep forward slash
|
|
147
160
|
|
|
148
161
|
# Typographical standardization
|
|
149
162
|
# Unify various dashes and hyphens to a standard hyphen
|
|
@@ -6,6 +6,7 @@ from .utilities import load_dataframe, save_dataframe
|
|
|
6
6
|
from .path_manager import make_fullpath
|
|
7
7
|
from ._script_info import _script_info
|
|
8
8
|
from ._logger import _LOGGER
|
|
9
|
+
from .constants import CHEMICAL_ELEMENT_SYMBOLS
|
|
9
10
|
|
|
10
11
|
|
|
11
12
|
__all__ = [
|
|
@@ -24,7 +25,8 @@ __all__ = [
|
|
|
24
25
|
"CategoryMapper",
|
|
25
26
|
"RegexMapper",
|
|
26
27
|
"ValueBinner",
|
|
27
|
-
"DateFeatureExtractor"
|
|
28
|
+
"DateFeatureExtractor",
|
|
29
|
+
"MolecularFormulaTransformer"
|
|
28
30
|
]
|
|
29
31
|
|
|
30
32
|
############ TRANSFORM MAIN ####################
|
|
@@ -48,17 +50,20 @@ class TransformationRecipe:
|
|
|
48
50
|
def add(
|
|
49
51
|
self,
|
|
50
52
|
input_col_name: str,
|
|
51
|
-
output_col_names: Union[str, List[str]],
|
|
52
53
|
transform: Union[str, Callable],
|
|
54
|
+
output_col_names: Optional[Union[str, List[str]]] = None
|
|
53
55
|
) -> "TransformationRecipe":
|
|
54
56
|
"""
|
|
55
57
|
Adds a new transformation step to the recipe.
|
|
56
58
|
|
|
57
59
|
Args:
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
A string for a 1-to-1 mapping
|
|
61
|
-
for a 1-to-many mapping.
|
|
60
|
+
input_col_name: The name of the column from the source DataFrame.
|
|
61
|
+
output_col_names: The desired name(s) for the output column(s).
|
|
62
|
+
- A string for a 1-to-1 mapping.
|
|
63
|
+
- A list of strings for a 1-to-many mapping.
|
|
64
|
+
- A string prefix for 1-to-many mapping.
|
|
65
|
+
- If None, the input name is used for 1-to-1 transforms,
|
|
66
|
+
or the transformer's default names are used for 1-to-many.
|
|
62
67
|
transform: The transformation to apply:
|
|
63
68
|
- Use "rename" for simple column renaming
|
|
64
69
|
- If callable, must accept a `pl.Series` as the only parameter and return either a `pl.Series` or `pl.DataFrame`.
|
|
@@ -78,10 +83,6 @@ class TransformationRecipe:
|
|
|
78
83
|
elif not isinstance(transform, Callable):
|
|
79
84
|
_LOGGER.error(f"'transform' must be a callable function or the string '{_RENAME}'.")
|
|
80
85
|
raise TypeError()
|
|
81
|
-
|
|
82
|
-
if isinstance(output_col_names, list) and transform == _RENAME:
|
|
83
|
-
_LOGGER.error("A RENAME operation cannot have a list of output columns.")
|
|
84
|
-
raise ValueError()
|
|
85
86
|
|
|
86
87
|
# --- Add Step ---
|
|
87
88
|
step = {
|
|
@@ -148,33 +149,53 @@ class DataProcessor:
|
|
|
148
149
|
result = transform_action(input_series)
|
|
149
150
|
|
|
150
151
|
if isinstance(result, pl.Series):
|
|
151
|
-
if
|
|
152
|
-
|
|
152
|
+
# Default to input name if spec is None
|
|
153
|
+
output_name = output_col_spec if output_col_spec is not None else input_col_name
|
|
154
|
+
|
|
155
|
+
if not isinstance(output_name, str):
|
|
156
|
+
_LOGGER.error(f"Function for '{input_col_name}' returned a Series but 'output_col' must be a string or None.")
|
|
153
157
|
raise TypeError()
|
|
154
|
-
processed_columns.append(result.alias(
|
|
158
|
+
processed_columns.append(result.alias(output_name))
|
|
155
159
|
|
|
156
160
|
elif isinstance(result, pl.DataFrame):
|
|
157
|
-
# 1. Handle
|
|
158
|
-
if
|
|
161
|
+
# 1. Handle None in output names
|
|
162
|
+
if output_col_spec is None:
|
|
163
|
+
# Use the column names generated by the transformer directly
|
|
164
|
+
processed_columns.extend(result.get_columns())
|
|
165
|
+
|
|
166
|
+
# 2. Handle list-based renaming
|
|
167
|
+
elif isinstance(output_col_spec, list):
|
|
159
168
|
if len(result.columns) != len(output_col_spec):
|
|
160
169
|
_LOGGER.error(f"Mismatch in '{input_col_name}': function produced {len(result.columns)} columns, but recipe specifies {len(output_col_spec)} output names.")
|
|
161
170
|
raise ValueError()
|
|
162
171
|
|
|
163
172
|
renamed_df = result.rename(dict(zip(result.columns, output_col_spec)))
|
|
164
173
|
processed_columns.extend(renamed_df.get_columns())
|
|
165
|
-
|
|
166
|
-
#
|
|
174
|
+
|
|
175
|
+
# 3. Global logic for adding a single prefix to all columns.
|
|
167
176
|
elif isinstance(output_col_spec, str):
|
|
168
177
|
prefix = output_col_spec
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
178
|
+
new_names = {}
|
|
179
|
+
|
|
180
|
+
for col in result.columns:
|
|
181
|
+
# Case 1: Transformer's output column name contains the input name.
|
|
182
|
+
# Action: Replace the input name with the desired prefix.
|
|
183
|
+
# Example: input='color', output='color_red', prefix='spec' -> 'spec_red'
|
|
184
|
+
if input_col_name in col:
|
|
185
|
+
new_names[col] = col.replace(input_col_name, prefix, 1)
|
|
186
|
+
|
|
187
|
+
# Case 2: Transformer's output is an independent name.
|
|
188
|
+
# Action: Prepend the prefix to the output name.
|
|
189
|
+
# Example: input='ratio', output='A_div_B', prefix='spec' -> 'spec_A_div_B'
|
|
190
|
+
else:
|
|
191
|
+
new_names[col] = f"{prefix}_{col}"
|
|
192
|
+
|
|
173
193
|
renamed_df = result.rename(new_names)
|
|
174
|
-
processed_columns.extend(renamed_df.get_columns())
|
|
194
|
+
processed_columns.extend(renamed_df.get_columns())
|
|
195
|
+
|
|
175
196
|
|
|
176
197
|
else:
|
|
177
|
-
_LOGGER.error(f"Function for '{input_col_name}' returned a DataFrame, so 'output_col' must be a list of names
|
|
198
|
+
_LOGGER.error(f"Function for '{input_col_name}' returned a DataFrame, so 'output_col' must be a list of names, a string prefix, or None.")
|
|
178
199
|
raise TypeError()
|
|
179
200
|
|
|
180
201
|
else:
|
|
@@ -1242,5 +1263,85 @@ class DateFeatureExtractor:
|
|
|
1242
1263
|
return pl.select(output_expressions)
|
|
1243
1264
|
|
|
1244
1265
|
|
|
1266
|
+
class MolecularFormulaTransformer:
|
|
1267
|
+
"""
|
|
1268
|
+
Parses a Polars Series of molecular formula strings into a wide DataFrame.
|
|
1269
|
+
|
|
1270
|
+
This one-to-many transformer takes a column of condensed molecular formulas
|
|
1271
|
+
(e.g., 'Li0.115Mn0.529Ni0.339O2') and converts it into a DataFrame where
|
|
1272
|
+
each chemical element has its own column. The value in each column is the
|
|
1273
|
+
stoichiometric quantity of that element.
|
|
1274
|
+
|
|
1275
|
+
It is designed to be used within the DataProcessor pipeline.
|
|
1276
|
+
"""
|
|
1277
|
+
|
|
1278
|
+
def __init__(self, prefix: str = "Fraction", separator: str = "_"):
|
|
1279
|
+
"""
|
|
1280
|
+
Initializes the transformer and pre-compiles the regex pattern.
|
|
1281
|
+
|
|
1282
|
+
Args:
|
|
1283
|
+
prefix (str): The prefix for the output column names. Defaults to "Fraction".
|
|
1284
|
+
separator (str): The separator between the prefix and element symbol. Defaults to "_".
|
|
1285
|
+
"""
|
|
1286
|
+
if not isinstance(prefix, str) or not isinstance(separator, str):
|
|
1287
|
+
_LOGGER.error("'prefix' and 'separator' must be strings.")
|
|
1288
|
+
raise TypeError()
|
|
1289
|
+
|
|
1290
|
+
self.prefix = prefix
|
|
1291
|
+
self.separator = separator
|
|
1292
|
+
# Sort symbols by length to prevent matching 'C' in 'Co'
|
|
1293
|
+
sorted_symbols = sorted(CHEMICAL_ELEMENT_SYMBOLS, key=len, reverse=True)
|
|
1294
|
+
|
|
1295
|
+
# Pre-compile regex for efficiency
|
|
1296
|
+
self.pattern = re.compile(rf'({"|".join(sorted_symbols)})(\d*\.?\d*)')
|
|
1297
|
+
|
|
1298
|
+
def __call__(self, column: pl.Series) -> pl.DataFrame:
|
|
1299
|
+
"""
|
|
1300
|
+
Executes the formula parsing logic.
|
|
1301
|
+
|
|
1302
|
+
Args:
|
|
1303
|
+
column: A Polars Series containing strings of molecular formulas.
|
|
1304
|
+
|
|
1305
|
+
Returns:
|
|
1306
|
+
A Polars DataFrame with columns for every chemical element.
|
|
1307
|
+
"""
|
|
1308
|
+
def parse_formula(formula: str) -> dict:
|
|
1309
|
+
"""Helper to parse a single formula string into a dictionary."""
|
|
1310
|
+
if not isinstance(formula, str) or not formula:
|
|
1311
|
+
return {}
|
|
1312
|
+
|
|
1313
|
+
matches = self.pattern.findall(formula)
|
|
1314
|
+
|
|
1315
|
+
# NOTE: assumes each element appears only once in the formula string;
|
|
1316
|
+
# if an element repeats, only its last matched quantity is kept.
|
|
1317
|
+
return {
|
|
1318
|
+
element: float(value) if value else 1.0
|
|
1319
|
+
for element, value in matches
|
|
1320
|
+
}
|
|
1321
|
+
|
|
1322
|
+
# Apply the parsing function to each element
|
|
1323
|
+
parsed_series = column.map_elements(parse_formula, return_dtype=pl.Object)
|
|
1324
|
+
|
|
1325
|
+
# Convert the Series of dictionaries into a DataFrame
|
|
1326
|
+
df = pl.DataFrame(parsed_series.to_list())
|
|
1327
|
+
|
|
1328
|
+
# Ensure all possible element columns are created, filling with 0
|
|
1329
|
+
select_expressions = []
|
|
1330
|
+
for symbol in CHEMICAL_ELEMENT_SYMBOLS:
|
|
1331
|
+
col_name = f"{self.prefix}{self.separator}{symbol}"
|
|
1332
|
+
if symbol in df.columns:
|
|
1333
|
+
expr = pl.col(symbol).fill_null(0).alias(col_name)
|
|
1334
|
+
else:
|
|
1335
|
+
expr = pl.lit(0.0, dtype=pl.Float64).alias(col_name)
|
|
1336
|
+
select_expressions.append(expr)
|
|
1337
|
+
|
|
1338
|
+
# Handle edge case where input series is not empty but parsing yields no rows
|
|
1339
|
+
base_df = df
|
|
1340
|
+
if df.height == 0 and column.len() > 0:
|
|
1341
|
+
base_df = pl.DataFrame({'dummy': range(column.len())})
|
|
1342
|
+
|
|
1343
|
+
return base_df.select(select_expressions)
|
|
1344
|
+
|
|
1345
|
+
|
|
1245
1346
|
def info():
|
|
1246
1347
|
_script_info(__all__)
|
|
@@ -20,12 +20,112 @@ from .SQL import DatabaseManager
|
|
|
20
20
|
from .optimization_tools import _save_result
|
|
21
21
|
from .utilities import threshold_binary_values, save_dataframe
|
|
22
22
|
|
|
23
|
+
|
|
23
24
|
__all__ = [
|
|
25
|
+
"MLOptimizer",
|
|
24
26
|
"create_pytorch_problem",
|
|
25
27
|
"run_optimization"
|
|
26
28
|
]
|
|
27
29
|
|
|
28
30
|
|
|
31
|
+
class MLOptimizer:
|
|
32
|
+
"""
|
|
33
|
+
A wrapper class for setting up and running EvoTorch optimization tasks.
|
|
34
|
+
|
|
35
|
+
This class combines the functionality of `create_pytorch_problem` and
|
|
36
|
+
`run_optimization` functions into a single, streamlined workflow.
|
|
37
|
+
|
|
38
|
+
SNES and CEM algorithms do not accept bounds; the given bounds will be used as an initial starting point.
|
|
39
|
+
|
|
40
|
+
Example:
|
|
41
|
+
>>> # 1. Initialize the optimizer with model and search parameters
|
|
42
|
+
>>> optimizer = MLOptimizer(
|
|
43
|
+
... inference_handler=my_handler,
|
|
44
|
+
... bounds=(lower_bounds, upper_bounds),
|
|
45
|
+
... number_binary_features=2,
|
|
46
|
+
... task="max",
|
|
47
|
+
... algorithm="Genetic"
|
|
48
|
+
... )
|
|
49
|
+
>>> # 2. Run the optimization and save the results
|
|
50
|
+
>>> best_result = optimizer.run(
|
|
51
|
+
... num_generations=100,
|
|
52
|
+
... target_name="my_target",
|
|
53
|
+
... feature_names=my_feature_names,
|
|
54
|
+
... save_dir="/path/to/results",
|
|
55
|
+
... save_format="csv"
|
|
56
|
+
... )
|
|
57
|
+
"""
|
|
58
|
+
def __init__(self,
|
|
59
|
+
inference_handler: PyTorchInferenceHandler,
|
|
60
|
+
bounds: Tuple[List[float], List[float]],
|
|
61
|
+
number_binary_features: int,
|
|
62
|
+
task: Literal["min", "max"],
|
|
63
|
+
algorithm: Literal["SNES", "CEM", "Genetic"] = "Genetic",
|
|
64
|
+
population_size: int = 200,
|
|
65
|
+
**searcher_kwargs):
|
|
66
|
+
"""
|
|
67
|
+
Initializes the optimizer by creating the EvoTorch problem and searcher.
|
|
68
|
+
|
|
69
|
+
Args:
|
|
70
|
+
inference_handler (PyTorchInferenceHandler): An initialized inference handler containing the model and weights.
|
|
71
|
+
bounds (tuple[list[float], list[float]]): A tuple containing the lower and upper bounds for the solution features.
|
|
72
|
+
number_binary_features (int): Number of binary features located at the END of the feature vector.
|
|
73
|
+
task (str): The optimization goal, either "min" or "max".
|
|
74
|
+
algorithm (str): The search algorithm to use ("SNES", "CEM", "Genetic").
|
|
75
|
+
population_size (int): Population size for CEM and GeneticAlgorithm.
|
|
76
|
+
**searcher_kwargs: Additional keyword arguments for the selected search algorithm's constructor.
|
|
77
|
+
"""
|
|
78
|
+
# Call the existing factory function to get the problem and searcher factory
|
|
79
|
+
self.problem, self.searcher_factory = create_pytorch_problem(
|
|
80
|
+
inference_handler=inference_handler,
|
|
81
|
+
bounds=bounds,
|
|
82
|
+
binary_features=number_binary_features,
|
|
83
|
+
task=task,
|
|
84
|
+
algorithm=algorithm,
|
|
85
|
+
population_size=population_size,
|
|
86
|
+
**searcher_kwargs
|
|
87
|
+
)
|
|
88
|
+
# Store binary_features count to pass it to the run function later
|
|
89
|
+
self._binary_features = number_binary_features
|
|
90
|
+
|
|
91
|
+
def run(self,
|
|
92
|
+
num_generations: int,
|
|
93
|
+
target_name: str,
|
|
94
|
+
save_dir: Union[str, Path],
|
|
95
|
+
feature_names: Optional[List[str]],
|
|
96
|
+
save_format: Literal['csv', 'sqlite', 'both'],
|
|
97
|
+
repetitions: int = 1,
|
|
98
|
+
verbose: bool = True) -> Optional[dict]:
|
|
99
|
+
"""
|
|
100
|
+
Runs the evolutionary optimization process using the pre-configured settings.
|
|
101
|
+
|
|
102
|
+
Args:
|
|
103
|
+
num_generations (int): The total number of generations for each repetition.
|
|
104
|
+
target_name (str): Target name used for the CSV filename and/or SQL table.
|
|
105
|
+
save_dir (str | Path): The directory where result files will be saved.
|
|
106
|
+
feature_names (List[str] | None): Names of the solution features for labeling output. If None, generic names like 'feature_0', 'feature_1', ... , will be created.
|
|
107
|
+
save_format (Literal['csv', 'sqlite', 'both']): The format for saving results.
|
|
108
|
+
repetitions (int): The number of independent times to run the optimization.
|
|
109
|
+
verbose (bool): If True, enables detailed logging.
|
|
110
|
+
|
|
111
|
+
Returns:
|
|
112
|
+
Optional[dict]: A dictionary with the best result if repetitions is 1, otherwise None.
|
|
113
|
+
"""
|
|
114
|
+
# Call the existing run function with the stored problem, searcher, and binary feature count
|
|
115
|
+
return run_optimization(
|
|
116
|
+
problem=self.problem,
|
|
117
|
+
searcher_factory=self.searcher_factory,
|
|
118
|
+
num_generations=num_generations,
|
|
119
|
+
target_name=target_name,
|
|
120
|
+
binary_features=self._binary_features,
|
|
121
|
+
save_dir=save_dir,
|
|
122
|
+
save_format=save_format,
|
|
123
|
+
feature_names=feature_names,
|
|
124
|
+
repetitions=repetitions,
|
|
125
|
+
verbose=verbose
|
|
126
|
+
)
|
|
127
|
+
|
|
128
|
+
|
|
29
129
|
def create_pytorch_problem(
|
|
30
130
|
inference_handler: PyTorchInferenceHandler,
|
|
31
131
|
bounds: Tuple[List[float], List[float]],
|
|
@@ -38,7 +138,7 @@ def create_pytorch_problem(
|
|
|
38
138
|
"""
|
|
39
139
|
Creates and configures an EvoTorch Problem and a Searcher factory class for a PyTorch model.
|
|
40
140
|
|
|
41
|
-
SNES and CEM do not accept bounds, the given bounds will be used as initial
|
|
141
|
+
SNES and CEM do not accept bounds; the given bounds will be used as an initial starting point.
|
|
42
142
|
|
|
43
143
|
The Genetic Algorithm works directly with the bounds, and operators such as SimulatedBinaryCrossOver and GaussianMutation.
|
|
44
144
|
|
|
@@ -62,8 +162,8 @@ def create_pytorch_problem(
|
|
|
62
162
|
|
|
63
163
|
# add binary bounds
|
|
64
164
|
if binary_features > 0:
|
|
65
|
-
lower_bounds.extend([0.
|
|
66
|
-
upper_bounds.extend([0.
|
|
165
|
+
lower_bounds.extend([0.48] * binary_features)
|
|
166
|
+
upper_bounds.extend([0.52] * binary_features)
|
|
67
167
|
|
|
68
168
|
solution_length = len(lower_bounds)
|
|
69
169
|
device = inference_handler.device
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
CHEMICAL_ELEMENTS = [
|
|
2
|
+
"Hydrogen", "Helium", "Lithium", "Beryllium", "Boron", "Carbon", "Nitrogen", "Oxygen", "Fluorine", "Neon",
|
|
3
|
+
"Sodium", "Magnesium", "Aluminum", "Silicon", "Phosphorus", "Sulfur", "Chlorine", "Argon",
|
|
4
|
+
"Potassium", "Calcium", "Scandium", "Titanium", "Vanadium", "Chromium", "Manganese", "Iron", "Cobalt", "Nickel", "Copper", "Zinc",
|
|
5
|
+
"Gallium", "Germanium", "Arsenic", "Selenium", "Bromine", "Krypton",
|
|
6
|
+
"Rubidium", "Strontium", "Yttrium", "Zirconium", "Niobium", "Molybdenum", "Technetium", "Ruthenium", "Rhodium", "Palladium", "Silver", "Cadmium",
|
|
7
|
+
"Indium", "Tin", "Antimony", "Tellurium", "Iodine", "Xenon",
|
|
8
|
+
"Cesium", "Barium", "Lanthanum", "Cerium", "Praseodymium", "Neodymium", "Promethium", "Samarium", "Europium", "Gadolinium", "Terbium", "Dysprosium", "Holmium", "Erbium", "Thulium", "Ytterbium", "Lutetium",
|
|
9
|
+
"Hafnium", "Tantalum", "Tungsten", "Rhenium", "Osmium", "Iridium", "Platinum", "Gold", "Mercury",
|
|
10
|
+
"Thallium", "Lead", "Bismuth", "Polonium", "Astatine", "Radon",
|
|
11
|
+
"Francium", "Radium", "Actinium", "Thorium", "Protactinium", "Uranium", "Neptunium", "Plutonium", "Americium", "Curium", "Berkelium", "Californium", "Einsteinium", "Fermium", "Mendelevium", "Nobelium", "Lawrencium",
|
|
12
|
+
"Rutherfordium", "Dubnium", "Seaborgium", "Bohrium", "Hassium", "Meitnerium", "Darmstadtium", "Roentgenium", "Copernicium", "Nihonium", "Flerovium", "Moscovium", "Livermorium", "Tennessine", "Oganesson"
|
|
13
|
+
]
|
|
14
|
+
|
|
15
|
+
CHEMICAL_ELEMENT_SYMBOLS = [
|
|
16
|
+
"H", "He", "Li", "Be", "B", "C", "N", "O", "F", "Ne",
|
|
17
|
+
"Na", "Mg", "Al", "Si", "P", "S", "Cl", "Ar",
|
|
18
|
+
"K", "Ca", "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn",
|
|
19
|
+
"Ga", "Ge", "As", "Se", "Br", "Kr",
|
|
20
|
+
"Rb", "Sr", "Y", "Zr", "Nb", "Mo", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd",
|
|
21
|
+
"In", "Sn", "Sb", "Te", "I", "Xe",
|
|
22
|
+
"Cs", "Ba", "La", "Ce", "Pr", "Nd", "Pm", "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb", "Lu",
|
|
23
|
+
"Hf", "Ta", "W", "Re", "Os", "Ir", "Pt", "Au", "Hg",
|
|
24
|
+
"Tl", "Pb", "Bi", "Po", "At", "Rn",
|
|
25
|
+
"Fr", "Ra", "Ac", "Th", "Pa", "U", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm", "Md", "No", "Lr",
|
|
26
|
+
"Rf", "Db", "Sg", "Bh", "Hs", "Mt", "Ds", "Rg", "Cn", "Nh", "Fl", "Mc", "Lv", "Ts", "Og"
|
|
27
|
+
]
|
|
28
|
+
|
|
29
|
+
# --- Physics & Chemistry ---
|
|
30
|
+
|
|
31
|
+
# Speed of light in vacuum (m/s)
|
|
32
|
+
SPEED_OF_LIGHT = 299792458.0
|
|
33
|
+
|
|
34
|
+
# Planck constant (J·s)
|
|
35
|
+
PLANCK_CONSTANT = 6.62607015e-34
|
|
36
|
+
|
|
37
|
+
# Avogadro's number (mol⁻¹)
|
|
38
|
+
AVOGADRO_NUMBER = 6.02214076e23
|
|
39
|
+
|
|
40
|
+
# Universal gas constant (J/(mol·K))
|
|
41
|
+
UNIVERSAL_GAS_CONSTANT = 8.314462618
|
|
42
|
+
|
|
43
|
+
# Boltzmann constant (J/K)
|
|
44
|
+
BOLTZMANN_CONSTANT = 1.380649e-23
|
|
45
|
+
|
|
46
|
+
# Gravitational constant (m³·kg⁻¹·s⁻²)
|
|
47
|
+
GRAVITATIONAL_CONSTANT = 6.67430e-11
|
|
48
|
+
|
|
49
|
+
# Standard acceleration of gravity on Earth (m/s²)
|
|
50
|
+
STANDARD_GRAVITY = 9.80665
|
|
51
|
+
|
|
52
|
+
# Elementary charge (C)
|
|
53
|
+
ELEMENTARY_CHARGE = 1.602176634e-19
|
|
54
|
+
|
|
55
|
+
# Electron mass (kg)
|
|
56
|
+
ELECTRON_MASS_KG = 9.1093837015e-31
|
|
57
|
+
|
|
58
|
+
# Proton mass (kg)
|
|
59
|
+
PROTON_MASS_KG = 1.67262192369e-27
|
|
60
|
+
|
|
61
|
+
# Absolute zero (in Celsius)
|
|
62
|
+
ABSOLUTE_ZERO_CELSIUS = -273.15
|
|
63
|
+
|
|
64
|
+
# --- Astronomy ---
|
|
65
|
+
|
|
66
|
+
# Astronomical Unit, the mean Earth-Sun distance (kilometers)
|
|
67
|
+
ASTRONOMICAL_UNIT_KM = 149597870.7
|
|
68
|
+
|
|
69
|
+
# Light-year (kilometers)
|
|
70
|
+
LIGHT_YEAR_KM = 9460730472580.8
|
|
71
|
+
|
|
72
|
+
# Earth's equatorial radius (kilometers)
|
|
73
|
+
EARTH_RADIUS_KM = 6378.137
|
|
74
|
+
|
|
75
|
+
# Mass of the Earth (kg)
|
|
76
|
+
EARTH_MASS_KG = 5.9722e24
|
|
77
|
+
|
|
78
|
+
# Mass of the Sun (kg)
|
|
79
|
+
SUN_MASS_KG = 1.98847e30
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dragon_ml_toolbox-10.14.0 → dragon_ml_toolbox-11.0.0}/dragon_ml_toolbox.egg-info/requires.txt
RENAMED
|
File without changes
|
{dragon_ml_toolbox-10.14.0 → dragon_ml_toolbox-11.0.0}/dragon_ml_toolbox.egg-info/top_level.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|