dragon-ml-toolbox 10.15.0__py3-none-any.whl → 11.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dragon-ml-toolbox might be problematic.

dragon_ml_toolbox-11.1.0.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: dragon-ml-toolbox
- Version: 10.15.0
+ Version: 11.1.0
  Summary: A collection of tools for data science and machine learning projects.
  Author-email: Karl Loza <luigiloza@gmail.com>
  License-Expression: MIT
@@ -139,6 +139,7 @@ pip install "dragon-ml-toolbox[pytorch]"
  #### Modules:
 
  ```bash
+ constants
  custom_logger
  data_exploration
  ensemble_evaluation
@@ -176,6 +177,7 @@ pip install "dragon-ml-toolbox[mice]"
  #### Modules:
 
  ```Bash
+ constants
  custom_logger
  MICE_imputation
  VIF_factor
@@ -196,6 +198,7 @@ pip install "dragon-ml-toolbox[excel]"
  #### Modules:
 
  ```Bash
+ constants
  custom_logger
  handle_excel
  path_manager
@@ -218,6 +221,7 @@ pip install "dragon-ml-toolbox[gui-boost,plot]"
  #### Modules:
 
  ```Bash
+ constants
  custom_logger
  GUI_tools
  ensemble_inference
@@ -241,6 +245,7 @@ pip install "dragon-ml-toolbox[gui-torch,plot]"
  #### Modules:
 
  ```Bash
+ constants
  custom_logger
  GUI_tools
  ML_models
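Every install profile above now includes the new `constants` module. A minimal import sketch (the module path follows the RECORD section below):

```python
from ml_tools import constants

print(constants.CHEMICAL_ELEMENT_SYMBOLS[:4])  # ['H', 'He', 'Li', 'Be']
print(constants.AVOGADRO_NUMBER)               # 6.02214076e+23
```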
dragon_ml_toolbox-11.1.0.dist-info/RECORD CHANGED
@@ -1,7 +1,7 @@
- dragon_ml_toolbox-10.15.0.dist-info/licenses/LICENSE,sha256=L35WDmmLZNTlJvxF6Vy7Uy4SYNi6rCfWUqlTHpoRMoU,1081
- dragon_ml_toolbox-10.15.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=iy2r_R7wjzsCbz_Q_jMsp_jfZ6oP8XW9QhwzRBH0mGY,1904
- ml_tools/ETL_cleaning.py,sha256=ECR3UwRMovifvDkVCyqmGDGlVhWst2eJS821NsRWny8,19851
- ml_tools/ETL_engineering.py,sha256=a6KCWH6kRatZtjaFEF_o917ApPMK5_vRD-BjfCDAl-E,49400
+ dragon_ml_toolbox-11.1.0.dist-info/licenses/LICENSE,sha256=L35WDmmLZNTlJvxF6Vy7Uy4SYNi6rCfWUqlTHpoRMoU,1081
+ dragon_ml_toolbox-11.1.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=iy2r_R7wjzsCbz_Q_jMsp_jfZ6oP8XW9QhwzRBH0mGY,1904
+ ml_tools/ETL_cleaning.py,sha256=-JrYkT8AvkZFK-Agzhp6uVxaZXzFw49t0txjf6Z1Apw,20365
+ ml_tools/ETL_engineering.py,sha256=pzv1WngYzdLo6eZX_JWRRAxNB0O4RvTaZzv5oj41WWA,54565
  ml_tools/GUI_tools.py,sha256=kEQWg-bog3pB5tI22gMGKWaCGHnz9TB2Lvvfhf5F2CI,45412
  ml_tools/MICE_imputation.py,sha256=kVSythWfxJFR4-2mtcYCWQaQ1Oz5yyx_SJu5gjnS7H8,11670
  ml_tools/ML_callbacks.py,sha256=JPvEw_cW5tYNJ2rMSgnNrKLuni_UrmuhDFaOw-u2SvA,13926
@@ -20,6 +20,7 @@ ml_tools/VIF_factor.py,sha256=MkMh_RIdsN2XUPzKNGRiEcmB17R_MmvGV4ezpL5zD2E,10403
  ml_tools/__init__.py,sha256=q0y9faQ6e17XCQ7eUiCZ1FJ4Bg5EQqLjZ9f_l5REUUY,41
  ml_tools/_logger.py,sha256=wcImAiXEZKPNcwM30qBh3t7HvoPURonJY0nrgMGF0sM,4719
  ml_tools/_script_info.py,sha256=21r83LV3RubsNZ_RTEUON6RbDf7Mh4_udweNcvdF_Fk,212
+ ml_tools/constants.py,sha256=3br5Rk9cL2IUo638eJuMOGdbGQaWssaUecYEvSeRBLM,3322
  ml_tools/custom_logger.py,sha256=ry43hk54K6xKo8jRAgq1sFxUpOA9T0LIJ7sw0so2BW0,5880
  ml_tools/data_exploration.py,sha256=-aTi5jmv4AepPgi2k_85qEJsSLx5zPOtTbhorqzUvGQ,38542
  ml_tools/ensemble_evaluation.py,sha256=FGHSe8LBI8_w8LjNeJWOcYQ1UK_mc6fVah8gmSvNVGg,26853
@@ -30,7 +31,7 @@ ml_tools/keys.py,sha256=FDpbS3Jb0pjrVvvp2_8nZi919mbob_-xwuy5OOtKM_A,1848
  ml_tools/optimization_tools.py,sha256=P3I6lIpvZ8Xf2kX5FvvBKBmrK2pB6idBpkTzfUJxTeE,5073
  ml_tools/path_manager.py,sha256=ke0MYOhYheRPX599GUbrvRsYHn2JKUmMDldS5LP6LQA,18431
  ml_tools/utilities.py,sha256=uheMUjQJ1zI69gASsE-mCq4KlRPVGgrgqson02rGNYM,30755
- dragon_ml_toolbox-10.15.0.dist-info/METADATA,sha256=2yN59s4nNgI3WbfE5l4-OyYmhjMQmB9uH3VYhjjprmI,6608
- dragon_ml_toolbox-10.15.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- dragon_ml_toolbox-10.15.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
- dragon_ml_toolbox-10.15.0.dist-info/RECORD,,
+ dragon_ml_toolbox-11.1.0.dist-info/METADATA,sha256=FvLmg4zkxGRpVyf-vt5DqKpSMY9GecfVd6MAbvPBA-Q,6657
+ dragon_ml_toolbox-11.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ dragon_ml_toolbox-11.1.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
+ dragon_ml_toolbox-11.1.0.dist-info/RECORD,,
ml_tools/ETL_cleaning.py CHANGED
@@ -19,20 +19,26 @@ __all__ = [
 
 
  ################ Unique Values per column #################
- def save_unique_values(csv_path: Union[str, Path], output_dir: Union[str, Path], verbose: bool=False) -> None:
+ def save_unique_values(csv_path: Union[str, Path],
+                        output_dir: Union[str, Path],
+                        verbose: bool=False,
+                        keep_column_order: bool = True) -> None:
      """
      Loads a CSV file, then analyzes it and saves the unique non-null values
      from each column into a separate text file exactly as they appear.
 
      This is useful for understanding the raw categories or range of values
-     within a dataset before cleaning.
+     within a dataset before and after cleaning.
 
      Args:
-         csv_path (Union[str, Path]):
+         csv_path (str | Path):
              The file path to the input CSV file.
-         output_dir (Union[str, Path]):
+         output_dir (str | Path):
              The path to the directory where the .txt files will be saved.
              The directory will be created if it does not exist.
+         keep_column_order (bool):
+             If True, prepends a numeric prefix (e.g., '1_', '2_') to each
+             output filename to maintain the original column order.
      """
      # --- 1. Input Validation ---
      csv_path = make_fullpath(input_path=csv_path, enforce="file")
@@ -74,7 +80,12 @@ def save_unique_values(csv_path: Union[str, Path], output_dir: Union[str, Path],
          sanitized_name = sanitize_filename(column_name)
          if not sanitized_name.strip('_'):
              sanitized_name = f'column_{i}'
-         file_path = output_dir / f"{sanitized_name}_unique_values.txt"
+
+         # --- create filename prefix ---
+         # If keep_column_order is True, create a prefix like "1_", "2_", etc.
+         prefix = f"{i + 1}_" if keep_column_order else ''
+
+         file_path = output_dir / f"{prefix}{sanitized_name}_unique_values.txt"
 
          # --- Write to file ---
          try:
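A usage sketch for the reworked `save_unique_values`; the file names here are hypothetical:

```python
from ml_tools.ETL_cleaning import save_unique_values

# With keep_column_order=True (the default), each output file gets a
# numeric prefix ("1_", "2_", ...) so listings preserve the original
# column order; pass False to keep the old un-prefixed names.
save_unique_values(
    csv_path="raw_dataset.csv",   # hypothetical input CSV
    output_dir="unique_values",   # created if it does not exist
    keep_column_order=True,
)
```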
@@ -126,9 +137,10 @@ def _cleaner_core(df_in: pl.DataFrame, all_lowercase: bool) -> pl.DataFrame:
      's': 's', 't': 't', 'u': 'u', 'v': 'v', 'w': 'w', 'x': 'x',
      'y': 'y', 'z': 'z',
      # Punctuation
-     '》': '>', '《': '<', ':': ':', '。': '.', ';': ';', '【': '[', '】': ']',
+     '》': '>', '《': '<', ':': ':', '。': '.', ';': ';', '【': '[', '】': ']', '∼': '~',
      '(': '(', ')': ')', '?': '?', '!': '!', '~': '~', '@': '@', '#': '#', '+': '+', '-': '-',
-     '$': '$', '%': '%', '^': '^', '&': '&', '*': '*', '\': '-', '|': '|', '≈':'=', '·': '-',
+     '$': '$', '%': '%', '^': '^', '&': '&', '*': '*', '\': '-', '|': '|', '≈':'=', '·': '', '⋅': '',
+     '¯': '-',
 
      # Commas (avoid commas in entries)
      ',': ';',
@@ -136,6 +148,8 @@ def _cleaner_core(df_in: pl.DataFrame, all_lowercase: bool) -> pl.DataFrame:
      '、':';',
 
      # Others
+     'σ': '',
+     '□': '',
      '©': '',
      '®': '',
      '™': '',
@@ -143,7 +157,6 @@ def _cleaner_core(df_in: pl.DataFrame, all_lowercase: bool) -> pl.DataFrame:
 
      # Replace special characters in entries
      r'\\': '_',
-     # '/': '_', # keep forward slash
 
      # Typographical standardization
      # Unify various dashes and hyphens to a standard hyphen
ml_tools/ETL_engineering.py CHANGED
@@ -6,6 +6,7 @@ from .utilities import load_dataframe, save_dataframe
  from .path_manager import make_fullpath
  from ._script_info import _script_info
  from ._logger import _LOGGER
+ from .constants import CHEMICAL_ELEMENT_SYMBOLS
 
 
  __all__ = [
@@ -24,7 +25,8 @@ __all__ = [
      "CategoryMapper",
      "RegexMapper",
      "ValueBinner",
-     "DateFeatureExtractor"
+     "DateFeatureExtractor",
+     "MolecularFormulaTransformer"
  ]
 
  ############ TRANSFORM MAIN ####################
@@ -48,17 +50,20 @@ class TransformationRecipe:
      def add(
          self,
          input_col_name: str,
-         output_col_names: Union[str, List[str]],
          transform: Union[str, Callable],
+         output_col_names: Optional[Union[str, List[str]]] = None
      ) -> "TransformationRecipe":
          """
          Adds a new transformation step to the recipe.
 
          Args:
-             input_col: The name of the column from the source DataFrame.
-             output_col: The desired name(s) for the output column(s).
-                 A string for a 1-to-1 mapping, or a list of strings
-                 for a 1-to-many mapping.
+             input_col_name: The name of the column from the source DataFrame.
+             output_col_names: The desired name(s) for the output column(s).
+                 - A string for a 1-to-1 mapping.
+                 - A list of strings for a 1-to-many mapping.
+                 - A string prefix for a 1-to-many mapping.
+                 - If None, the input name is used for 1-to-1 transforms,
+                   or the transformer's default names are used for 1-to-many.
              transform: The transformation to apply:
                  - Use "rename" for simple column renaming
                  - If callable, must accept a `pl.Series` as the only parameter and return either a `pl.Series` or `pl.DataFrame`.
@@ -78,10 +83,6 @@ class TransformationRecipe:
          elif not isinstance(transform, Callable):
              _LOGGER.error(f"'transform' must be a callable function or the string '{_RENAME}'.")
              raise TypeError()
-
-         if isinstance(output_col_names, list) and transform == _RENAME:
-             _LOGGER.error("A RENAME operation cannot have a list of output columns.")
-             raise ValueError()
 
          # --- Add Step ---
          step = {
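Since `output_col_names` is now optional and follows `transform`, a recipe entry can omit the output name entirely. A sketch of the updated call styles (assuming `add` still returns the recipe, as its signature indicates):

```python
import polars as pl
from ml_tools.ETL_engineering import TransformationRecipe, DataProcessor

recipe = TransformationRecipe()
# 1-to-1 rename with an explicit output name:
recipe.add("id", transform="rename", output_col_names="sample_id")
# output_col_names omitted: the transformed Series keeps the name "temp_c":
recipe.add("temp_c", transform=lambda s: s.cast(pl.Float64, strict=False))

processor = DataProcessor(recipe)
```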
@@ -105,7 +106,7 @@ class DataProcessor:
      """
      Transforms a Polars DataFrame based on a provided `TransformationRecipe` object.
 
-     Use the method `transform()`.
+     Use the methods `transform()` or `load_transform_save()`.
      """
      def __init__(self, recipe: TransformationRecipe):
          """
@@ -148,33 +149,53 @@ class DataProcessor:
              result = transform_action(input_series)
 
              if isinstance(result, pl.Series):
-                 if not isinstance(output_col_spec, str):
-                     _LOGGER.error(f"Function for '{input_col_name}' returned a Series but 'output_col' is not a string.")
+                 # Default to input name if spec is None
+                 output_name = output_col_spec if output_col_spec is not None else input_col_name
+
+                 if not isinstance(output_name, str):
+                     _LOGGER.error(f"Function for '{input_col_name}' returned a Series but 'output_col' must be a string or None.")
                      raise TypeError()
-                 processed_columns.append(result.alias(output_col_spec))
+                 processed_columns.append(result.alias(output_name))
 
              elif isinstance(result, pl.DataFrame):
-                 # 1. Handle list-based renaming
-                 if isinstance(output_col_spec, list):
+                 # 1. Handle None in output names
+                 if output_col_spec is None:
+                     # Use the column names generated by the transformer directly
+                     processed_columns.extend(result.get_columns())
+
+                 # 2. Handle list-based renaming
+                 elif isinstance(output_col_spec, list):
                      if len(result.columns) != len(output_col_spec):
                          _LOGGER.error(f"Mismatch in '{input_col_name}': function produced {len(result.columns)} columns, but recipe specifies {len(output_col_spec)} output names.")
                          raise ValueError()
 
                      renamed_df = result.rename(dict(zip(result.columns, output_col_spec)))
                      processed_columns.extend(renamed_df.get_columns())
-
-                 # 2. Handle a string prefix for AutoDummifier
+
+                 # 3. Global logic for adding a single prefix to all columns.
                  elif isinstance(output_col_spec, str):
                      prefix = output_col_spec
-                     # Replace the original name part with the desired prefix.
-                     new_names = {
-                         col: f"{prefix}{col[len(input_col_name):]}" for col in result.columns
-                     }
+                     new_names = {}
+
+                     for col in result.columns:
+                         # Case 1: Transformer's output column name contains the input name.
+                         # Action: Replace the input name with the desired prefix.
+                         # Example: input='color', output='color_red', prefix='spec' -> 'spec_red'
+                         if input_col_name in col:
+                             new_names[col] = col.replace(input_col_name, prefix, 1)
+
+                         # Case 2: Transformer's output is an independent name.
+                         # Action: Prepend the prefix to the output name.
+                         # Example: input='ratio', output='A_B', prefix='spec' -> 'spec_A_B'
+                         else:
+                             new_names[col] = f"{prefix}_{col}"
+
                      renamed_df = result.rename(new_names)
-                     processed_columns.extend(renamed_df.get_columns())
+                     processed_columns.extend(renamed_df.get_columns())
+
 
                  else:
-                     _LOGGER.error(f"Function for '{input_col_name}' returned a DataFrame, so 'output_col' must be a list of names or a string prefix.")
+                     _LOGGER.error(f"Function for '{input_col_name}' returned a DataFrame, so 'output_col' must be a list of names, a string prefix, or None.")
                      raise TypeError()
 
              else:
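The two prefix cases above are easiest to see in isolation. A standalone mirror of the renaming logic, for experimentation only (not the library's API):

```python
def apply_prefix(input_col_name: str, prefix: str, cols: list) -> dict:
    """Mirrors the DataProcessor prefix-renaming cases shown above."""
    new_names = {}
    for col in cols:
        if input_col_name in col:
            # Case 1: output contains the input name -> swap in the prefix.
            new_names[col] = col.replace(input_col_name, prefix, 1)
        else:
            # Case 2: independent output name -> prepend the prefix.
            new_names[col] = f"{prefix}_{col}"
    return new_names

assert apply_prefix("color", "spec", ["color_red"]) == {"color_red": "spec_red"}
assert apply_prefix("ratio", "spec", ["A_B"]) == {"A_B": "spec_A_B"}
```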
@@ -278,7 +299,7 @@ class BinaryTransformer:
              _LOGGER.error("Provide either 'true_keywords' or 'false_keywords', but not both.")
              raise ValueError()
          if true_keywords is None and false_keywords is None:
-             _LOGGER.error("You must provide either 'true_keywords' or 'false_keywords'.")
+             _LOGGER.error("Provide either 'true_keywords' or 'false_keywords'.")
              raise ValueError()
 
          # --- Configuration ---
@@ -310,16 +331,17 @@ class BinaryTransformer:
          Returns:
              pl.Series: A new Series of type UInt8 containing 1s and 0s.
          """
+         column_base_name = column.name
          # Create a boolean Series: True if any keyword is found, else False
          contains_keyword = column.str.contains(self.pattern)
 
          # Apply logic and cast directly to integer type
          if self.mode == "true_mode":
              # True -> 1, False -> 0
-             return contains_keyword.cast(pl.UInt8)
+             return contains_keyword.cast(pl.UInt8).alias(column_base_name)
          else:  # false_mode
              # We want the inverse: True -> 0, False -> 1
-             return (~contains_keyword).cast(pl.UInt8)
+             return (~contains_keyword).cast(pl.UInt8).alias(column_base_name)
 
 
  class AutoDummifier:
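Effect of the new alias on `BinaryTransformer`, sketched with the keyword argument visible in the validation code above:

```python
import polars as pl
from ml_tools.ETL_engineering import BinaryTransformer

flagger = BinaryTransformer(true_keywords=["yes", "approved"])
result = flagger(pl.Series("consent", ["yes", "no", "approved later"]))
# `result` is a UInt8 Series of 1s and 0s that now keeps the input
# name "consent" instead of coming back unnamed.
```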
@@ -389,11 +411,12 @@ class MultiBinaryDummifier:
          Returns:
              pl.DataFrame: A DataFrame where each column corresponds to a keyword.
          """
+         column_base_name = column.name
          # Ensure the input is treated as a string, preserving nulls
          str_column = column.cast(pl.Utf8)
 
          output_expressions = []
-         for i, keyword in enumerate(self.keywords):
+         for keyword in self.keywords:
              # Escape keyword to treat it as a literal, not a regex pattern
              base_pattern = re.escape(keyword)
 
@@ -407,7 +430,7 @@
                  .when(str_column.str.contains(pattern))
                  .then(pl.lit(1, dtype=pl.UInt8))
                  .otherwise(pl.lit(0, dtype=pl.UInt8))
-                 .alias(f"col_{i}")  # Generic name for DataProcessor
+                 .alias(f"{column_base_name}_{keyword}")  # name for DataProcessor
              )
              output_expressions.append(expr)
 
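A sketch of the new output naming for `MultiBinaryDummifier` (the `keywords` constructor argument is assumed from `self.keywords` above):

```python
import polars as pl
from ml_tools.ETL_engineering import MultiBinaryDummifier

dummifier = MultiBinaryDummifier(keywords=["red", "blue"])
df = dummifier(pl.Series("color", ["red and blue", "blue", "green"]))
# Columns are now "color_red" and "color_blue" (UInt8 flags),
# replacing the old generic "col_0", "col_1".
```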
@@ -451,6 +474,7 @@ class KeywordDummifier:
          Returns:
              pl.DataFrame: A DataFrame with one-hot encoded columns.
          """
+         column_base_name = column.name
          column = column.cast(pl.Utf8)
 
          categorize_expr = pl.when(pl.lit(False)).then(pl.lit(None, dtype=pl.Utf8))
 
@@ -469,22 +493,24 @@
                  column.str.contains(pattern)
              ).then(pl.lit(name))
 
-         categorize_expr = categorize_expr.otherwise(None).alias("category")
+         dummy_name = 'dummy_category'
+
+         categorize_expr = categorize_expr.otherwise(None).alias(dummy_name)
 
          temp_df = pl.select(categorize_expr)
-         df_with_dummies = temp_df.to_dummies(columns=["category"])
+         df_with_dummies = temp_df.to_dummies(columns=[dummy_name])
 
          final_columns = []
          for name in self.group_names:
-             dummy_col_name = f"category_{name}"
+             dummy_col_name = f"{dummy_name}_{name}"
              if dummy_col_name in df_with_dummies.columns:
-                 # The alias here uses the group name as the temporary column name
+                 # The alias here uses the group name as the final column name
                  final_columns.append(
-                     df_with_dummies.get_column(dummy_col_name).alias(name)
+                     df_with_dummies.get_column(dummy_col_name).alias(f"{column_base_name}_{name}")
                  )
              else:
                  # If a group had no matches, create a column of zeros
-                 final_columns.append(pl.lit(0, dtype=pl.UInt8).alias(name))
+                 final_columns.append(pl.lit(0, dtype=pl.UInt8).alias(f"{column_base_name}_{name}"))
 
          return pl.select(final_columns)
 
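For `KeywordDummifier` the change is analogous; a sketch in which `group_names` comes from the loop above and `group_keywords` is an assumed companion argument:

```python
import polars as pl
from ml_tools.ETL_engineering import KeywordDummifier

onehot = KeywordDummifier(
    group_names=["steel", "polymer"],                       # from the code above
    group_keywords=[["steel", "iron"], ["nylon", "PET"]],   # assumed argument name
)
df = onehot(pl.Series("material", ["cold-rolled steel", "PET bottle"]))
# One-hot columns arrive as "material_steel" and "material_polymer"
# rather than the bare group names "steel" / "polymer".
```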
@@ -553,6 +579,7 @@ class NumberExtractor:
          Returns:
              pl.Series: A new Series containing the extracted numbers.
          """
+         column_base_name = column.name
          # Extract the first (and only) capturing group
          extracted = column.str.extract(self.regex_pattern, 1)
 
@@ -563,7 +590,7 @@
          if self.dtype == "float" and self.round_digits is not None:
              return casted.round(self.round_digits)
 
-         return casted
+         return casted.alias(column_base_name)
 
 
  class MultiNumberExtractor:
@@ -624,6 +651,7 @@
          """
          Executes the multi-number extraction logic. Preserves nulls from the input column.
          """
+         column_base_name = column.name
          output_expressions = []
          for i in range(self.num_outputs):
              # Define the core extraction logic for the i-th number
@@ -643,7 +671,7 @@
                  pl.when(column.is_not_null())
                  .then(extraction_expr)
                  .otherwise(None)
-                 .alias(f"col_{i}")  # Name the final output expression
+                 .alias(f"{column_base_name}_{i}")  # Name the final output expression
              )
 
              output_expressions.append(final_expr)
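Both extractors now propagate the input column's name; a sketch with parameter names assumed from the attributes referenced above (`regex_pattern`, `dtype`, `round_digits`, `num_outputs`):

```python
import polars as pl
from ml_tools.ETL_engineering import NumberExtractor, MultiNumberExtractor

single = NumberExtractor(regex_pattern=r"(\d+\.?\d*)", dtype="float", round_digits=2)
s = single(pl.Series("pressure", ["12.5 MPa", "no reading"]))
# `s` keeps the name "pressure"; unmatched entries become null.

pair = MultiNumberExtractor(num_outputs=2, regex_pattern=r"(\d+\.?\d*)")
# Its outputs are now aliased "<input>_0", "<input>_1", ... instead of "col_0", "col_1".
```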
@@ -710,6 +738,7 @@ class TemperatureExtractor:
          Returns:
              pl.Series: A new Series containing the final temperature values as floats.
          """
+         column_base_name = column.name
          # --- Step 1: Extract number(s) to get a Celsius value expression ---
          if self.average_mode:
              # Extract all numbers and compute their mean. Polars' list.mean()
 
@@ -738,7 +767,7 @@
          # --- Step 3: Round the result and return as a Series ---
          # The select().to_series() pattern is a robust way to execute an
          # expression and guarantee a Series is returned.
-         return pl.select(final_expr.round(2)).to_series()
+         return pl.select(final_expr.round(2)).to_series().alias(column_base_name)
 
 
  class MultiTemperatureExtractor:
@@ -799,6 +828,7 @@ class MultiTemperatureExtractor:
          """
          Applies the multi-temperature extraction and conversion logic.
          """
+         column_base_name = column.name
          output_expressions = []
          for i in range(self.num_outputs):
              # --- Step 1: Extract the i-th number as a Celsius value ---
 
@@ -829,7 +859,7 @@
                  pl.when(column.is_not_null())
                  .then(final_expr)
                  .otherwise(None)
-                 .alias(f"col_{i}")  # Temporary name for DataProcessor
+                 .alias(f"{column_base_name}_{i}")  # name for DataProcessor
              )
 
              output_expressions.append(final_expr)
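The same renaming applies to the temperature extractors; a sketch using the `average_mode` attribute visible above (any unit-conversion options are omitted since this hunk does not show them):

```python
import polars as pl
from ml_tools.ETL_engineering import TemperatureExtractor

extractor = TemperatureExtractor(average_mode=True)
temps = extractor(pl.Series("sintering_temp", ["800-900 C", None]))
# `temps` is returned under the input name "sintering_temp"; before this
# change the pl.select(...).to_series() round-trip discarded that name.
```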
@@ -871,6 +901,7 @@ class RatioCalculator:
          """
          Applies the ratio calculation logic to the input column. Uses .str.extract() for maximum stability and includes optional handling for zeros and single numbers.
          """
+         column_base_name = column.name
          # Extract numerator (group 1) and denominator (group 2) separately.
          numerator_expr = column.str.extract(self.regex_pattern, 1).cast(pl.Float64, strict=False)
          denominator_expr = column.str.extract(self.regex_pattern, 2).cast(pl.Float64, strict=False)
 
@@ -908,7 +939,7 @@
          else:
              final_expr = ratio_expr
 
-         return pl.select(final_expr.round(4)).to_series()
+         return pl.select(final_expr.round(4)).to_series().alias(column_base_name)
 
 
  class TriRatioCalculator:
@@ -949,6 +980,7 @@ class TriRatioCalculator:
          """
          Applies the robust tri-ratio logic using the lazy API.
          """
+         column_base_name = column.name
          # Wrap the input Series in a DataFrame to use the lazy expression API
          temp_df = column.to_frame()
 
@@ -973,8 +1005,8 @@
 
          # Execute the expressions and return the final DataFrame
          return temp_df.select(
-             A_div_B=ratio_ab_expr,
-             A_div_C=ratio_ac_expr
+             ratio_ab_expr.alias(f"{column_base_name}_A_to_B"),
+             ratio_ac_expr.alias(f"{column_base_name}_A_to_C")
          )
 
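The tri-ratio output names are now derived from the input column; a sketch:

```python
import polars as pl
from ml_tools.ETL_engineering import TriRatioCalculator

calc = TriRatioCalculator()  # constructor options, if any, left at defaults
df = calc(pl.Series("blend", ["3:2:1", "1:1:2"]))
# Output columns are "blend_A_to_B" and "blend_A_to_C",
# replacing the fixed names "A_div_B" / "A_div_C".
```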
@@ -1015,6 +1047,7 @@ class CategoryMapper:
          Returns:
              pl.Series: A new Series with categories mapped to numbers.
          """
+         column_base_name = column.name
          # Ensure the column is treated as a string for matching keys
          str_column = column.cast(pl.Utf8)
 
@@ -1031,7 +1064,7 @@ class CategoryMapper:
              pl.lit(self.default_value)
          )
 
-         return pl.select(final_expr).to_series()
+         return pl.select(final_expr).to_series().alias(column_base_name)
 
 
  class RegexMapper:
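A sketch for `CategoryMapper`, with constructor arguments assumed from the attributes above (a category-to-number dict plus `default_value`):

```python
import polars as pl
from ml_tools.ETL_engineering import CategoryMapper

mapper = CategoryMapper(mapping={"low": 0, "mid": 1, "high": 2},  # assumed name
                        default_value=-1)
s = mapper(pl.Series("grade", ["low", "high", "unheard-of"]))
# `s` comes back named "grade"; unmapped strings fall back to -1.
```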
@@ -1095,6 +1128,7 @@ class RegexMapper:
              pl.Series: A new Series with strings mapped to numbers based on
              the first matching regex pattern.
          """
+         column_base_name = column.name
          # pl.String is the modern alias for pl.Utf8
          str_column = column.cast(pl.String)
 
@@ -1109,7 +1143,7 @@
              .otherwise(mapping_expr)
          )
 
-         return pl.select(mapping_expr).to_series()
+         return pl.select(mapping_expr).to_series().alias(column_base_name)
 
 
  class ValueBinner:
@@ -1159,6 +1193,7 @@ class ValueBinner:
              pl.Series: A new Series of integer labels for the bins. Values
              outside the specified breaks will become null.
          """
+         column_base_name = column.name
          # `cut` creates a new column of type Categorical
          binned_column = column.cut(
              breaks=self.breaks,
 
@@ -1168,7 +1203,7 @@
          )
 
          # to_physical() converts the Categorical type to its underlying
          # integer representation (u32), which is perfect for ML.
-         return binned_column.to_physical()
+         return binned_column.to_physical().alias(column_base_name)
 
 
  class DateFeatureExtractor:
@@ -1177,16 +1212,6 @@ class DateFeatureExtractor:
 
      It can handle columns that are already in a Polars Date/Datetime format,
      or it can parse string columns if a format is provided.
-
-     Args:
-         features (List[str]):
-             A list of the date/time features to extract. Supported features are:
-             'year', 'month', 'day', 'hour', 'minute', 'second', 'millisecond',
-             'microsecond', 'nanosecond', 'ordinal_day' (day of year),
-             'weekday' (Mon=1, Sun=7), 'week' (week of year), and 'timestamp'.
-         format (str | None):
-             The format code used to parse string dates (e.g., "%Y-%m-%d %H:%M:%S").
-             Use if the input column is not a Date or Datetime type.
      """
 
      ALLOWED_FEATURES = {
@@ -1199,6 +1224,17 @@ class DateFeatureExtractor:
          features: List[str],
          format: Optional[str] = None,
      ):
+         """
+         Args:
+             features (List[str]):
+                 A list of the date/time features to extract. Supported features are:
+                 'year', 'month', 'day', 'hour', 'minute', 'second', 'millisecond',
+                 'microsecond', 'nanosecond', 'ordinal_day' (day of year),
+                 'weekday' (Mon=1, Sun=7), 'week' (week of year), 'timestamp'.
+             format (str | None):
+                 The format code used to parse string dates (e.g., "%Y-%m-%d %H:%M:%S").
+                 Use if the input column is not a Date or Datetime type.
+         """
          # --- Validation ---
          if not isinstance(features, list) or not features:
              _LOGGER.error("'features' must be a non-empty list of strings.")
@@ -1222,6 +1258,7 @@
          Returns:
              pl.DataFrame: A DataFrame with columns for each extracted feature.
          """
+         column_base_name = column.name
          date_col = column
          # First, parse strings into a datetime object if a format is given
          if self.format is not None:
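A sketch for `DateFeatureExtractor`, whose `features` and `format` arguments are documented in the docstring above; the new per-feature column names appear in the next hunk:

```python
import polars as pl
from ml_tools.ETL_engineering import DateFeatureExtractor

extractor = DateFeatureExtractor(features=["year", "month", "weekday"],
                                 format="%Y-%m-%d")
df = extractor(pl.Series("ship_date", ["2024-01-15", "2024-02-01"]))
# Columns are now "ship_date_year", "ship_date_month", "ship_date_weekday"
# instead of the generic "col_0", "col_1", "col_2".
```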
@@ -1237,10 +1274,81 @@
              expr = getattr(date_col.dt, feature)()
 
              # Alias with a generic name for the processor to handle
-             output_expressions.append(expr.alias(f"col_{i}"))
+             output_expressions.append(expr.alias(f"{column_base_name}_{feature}"))
 
          return pl.select(output_expressions)
 
 
+ class MolecularFormulaTransformer:
+     """
+     Parses a Polars Series of molecular formula strings into a wide DataFrame.
+
+     This one-to-many transformer takes a column of condensed molecular formulas
+     (e.g., 'Li0.115Mn0.529Ni0.339O2') and converts it into a DataFrame where
+     each chemical element has its own column. The value in each column is the
+     stoichiometric quantity of that element.
+
+     It is designed to be used within the DataProcessor pipeline.
+     """
+
+     def __init__(self):
+         """
+         Initializes the transformer and pre-compiles the regex pattern.
+         """
+         # Sort symbols by length to prevent matching 'C' in 'Co'
+         sorted_symbols = sorted(CHEMICAL_ELEMENT_SYMBOLS, key=len, reverse=True)
+
+         # Pre-compile regex for efficiency
+         self.pattern = re.compile(rf'({"|".join(sorted_symbols)})(\d*\.?\d*)')
+
+     def __call__(self, column: pl.Series) -> pl.DataFrame:
+         """
+         Executes the formula parsing logic.
+
+         Args:
+             column: A Polars Series containing strings of molecular formulas.
+
+         Returns:
+             A Polars DataFrame with columns for every chemical element.
+         """
+         column_base_name = column.name
+
+         def parse_formula(formula: str) -> dict:
+             """Helper to parse a single formula string into a dictionary."""
+             if not isinstance(formula, str) or not formula:
+                 return {}
+
+             matches = self.pattern.findall(formula)
+
+             # This dict comprehension assumes that each element appears
+             # only once in the formula string.
+             return {
+                 element: float(value) if value else 1.0
+                 for element, value in matches
+             }
+
+         # Apply the parsing function to each entry
+         parsed_series = column.map_elements(parse_formula, return_dtype=pl.Object)
+
+         # Convert the Series of dictionaries into a DataFrame
+         df = pl.DataFrame(parsed_series.to_list())
+
+         # Ensure all possible element columns are created, filling with 0
+         select_expressions = []
+         for symbol in CHEMICAL_ELEMENT_SYMBOLS:
+             col_name = f"{column_base_name}_{symbol}"
+             if symbol in df.columns:
+                 expr = pl.col(symbol).fill_null(0).alias(col_name)
+             else:
+                 expr = pl.lit(0.0, dtype=pl.Float64).alias(col_name)
+             select_expressions.append(expr)
+
+         # Handle edge case where input series is not empty but parsing yields no rows
+         base_df = df
+         if df.height == 0 and column.len() > 0:
+             base_df = pl.DataFrame({'dummy': range(column.len())})
+
+         return base_df.select(select_expressions)
+
+
  def info():
      _script_info(__all__)
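A usage sketch for the new `MolecularFormulaTransformer`, standalone or as a one-to-many step in a `DataProcessor` recipe:

```python
import polars as pl
from ml_tools.ETL_engineering import MolecularFormulaTransformer

transformer = MolecularFormulaTransformer()
df = transformer(pl.Series("formula", ["Li0.115Mn0.529Ni0.339O2", "Fe2O3"]))

# One Float64 column per element, named "formula_<symbol>"; elements
# absent from a formula are 0.0, and an omitted quantity counts as 1.0.
print(df.select(["formula_Li", "formula_Mn", "formula_Fe", "formula_O"]))
```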
ml_tools/constants.py ADDED
@@ -0,0 +1,79 @@
+ CHEMICAL_ELEMENTS = [
+     "Hydrogen", "Helium", "Lithium", "Beryllium", "Boron", "Carbon", "Nitrogen", "Oxygen", "Fluorine", "Neon",
+     "Sodium", "Magnesium", "Aluminum", "Silicon", "Phosphorus", "Sulfur", "Chlorine", "Argon",
+     "Potassium", "Calcium", "Scandium", "Titanium", "Vanadium", "Chromium", "Manganese", "Iron", "Cobalt", "Nickel", "Copper", "Zinc",
+     "Gallium", "Germanium", "Arsenic", "Selenium", "Bromine", "Krypton",
+     "Rubidium", "Strontium", "Yttrium", "Zirconium", "Niobium", "Molybdenum", "Technetium", "Ruthenium", "Rhodium", "Palladium", "Silver", "Cadmium",
+     "Indium", "Tin", "Antimony", "Tellurium", "Iodine", "Xenon",
+     "Cesium", "Barium", "Lanthanum", "Cerium", "Praseodymium", "Neodymium", "Promethium", "Samarium", "Europium", "Gadolinium", "Terbium", "Dysprosium", "Holmium", "Erbium", "Thulium", "Ytterbium", "Lutetium",
+     "Hafnium", "Tantalum", "Tungsten", "Rhenium", "Osmium", "Iridium", "Platinum", "Gold", "Mercury",
+     "Thallium", "Lead", "Bismuth", "Polonium", "Astatine", "Radon",
+     "Francium", "Radium", "Actinium", "Thorium", "Protactinium", "Uranium", "Neptunium", "Plutonium", "Americium", "Curium", "Berkelium", "Californium", "Einsteinium", "Fermium", "Mendelevium", "Nobelium", "Lawrencium",
+     "Rutherfordium", "Dubnium", "Seaborgium", "Bohrium", "Hassium", "Meitnerium", "Darmstadtium", "Roentgenium", "Copernicium", "Nihonium", "Flerovium", "Moscovium", "Livermorium", "Tennessine", "Oganesson"
+ ]
+
+ CHEMICAL_ELEMENT_SYMBOLS = [
+     "H", "He", "Li", "Be", "B", "C", "N", "O", "F", "Ne",
+     "Na", "Mg", "Al", "Si", "P", "S", "Cl", "Ar",
+     "K", "Ca", "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn",
+     "Ga", "Ge", "As", "Se", "Br", "Kr",
+     "Rb", "Sr", "Y", "Zr", "Nb", "Mo", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd",
+     "In", "Sn", "Sb", "Te", "I", "Xe",
+     "Cs", "Ba", "La", "Ce", "Pr", "Nd", "Pm", "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb", "Lu",
+     "Hf", "Ta", "W", "Re", "Os", "Ir", "Pt", "Au", "Hg",
+     "Tl", "Pb", "Bi", "Po", "At", "Rn",
+     "Fr", "Ra", "Ac", "Th", "Pa", "U", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm", "Md", "No", "Lr",
+     "Rf", "Db", "Sg", "Bh", "Hs", "Mt", "Ds", "Rg", "Cn", "Nh", "Fl", "Mc", "Lv", "Ts", "Og"
+ ]
+
+ # --- Physics & Chemistry ---
+
+ # Speed of light in vacuum (m/s)
+ SPEED_OF_LIGHT = 299792458.0
+
+ # Planck constant (J·s)
+ PLANCK_CONSTANT = 6.62607015e-34
+
+ # Avogadro's number (mol⁻¹)
+ AVOGADRO_NUMBER = 6.02214076e23
+
+ # Universal gas constant (J/(mol·K))
+ UNIVERSAL_GAS_CONSTANT = 8.314462618
+
+ # Boltzmann constant (J/K)
+ BOLTZMANN_CONSTANT = 1.380649e-23
+
+ # Gravitational constant (m³·kg⁻¹·s⁻²)
+ GRAVITATIONAL_CONSTANT = 6.67430e-11
+
+ # Standard acceleration of gravity on Earth (m/s²)
+ STANDARD_GRAVITY = 9.80665
+
+ # Elementary charge (C)
+ ELEMENTARY_CHARGE = 1.602176634e-19
+
+ # Electron mass (kg)
+ ELECTRON_MASS_KG = 9.1093837015e-31
+
+ # Proton mass (kg)
+ PROTON_MASS_KG = 1.67262192369e-27
+
+ # Absolute zero (in Celsius)
+ ABSOLUTE_ZERO_CELSIUS = -273.15
+
+ # --- Astronomy ---
+
+ # Astronomical Unit, the mean Earth-Sun distance (kilometers)
+ ASTRONOMICAL_UNIT_KM = 149597870.7
+
+ # Light-year (kilometers)
+ LIGHT_YEAR_KM = 9460730472580.8
+
+ # Earth's equatorial radius (kilometers)
+ EARTH_RADIUS_KM = 6378.137
+
+ # Mass of the Earth (kg)
+ EARTH_MASS_KG = 5.9722e24
+
+ # Mass of the Sun (kg)
+ SUN_MASS_KG = 1.98847e30
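A small sketch showing how the new constants compose (using only values defined above):

```python
from ml_tools.constants import ABSOLUTE_ZERO_CELSIUS, CHEMICAL_ELEMENT_SYMBOLS

def celsius_to_kelvin(t_celsius: float) -> float:
    """K = °C - (-273.15)."""
    return t_celsius - ABSOLUTE_ZERO_CELSIUS

assert celsius_to_kelvin(25.0) == 298.15
assert len(CHEMICAL_ELEMENT_SYMBOLS) == 118  # H through Og
```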