dragon-ml-toolbox 11.0.0-py3-none-any.whl → 11.1.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dragon-ml-toolbox might be problematic. Click here for more details.

@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dragon-ml-toolbox
3
- Version: 11.0.0
3
+ Version: 11.1.1
4
4
  Summary: A collection of tools for data science and machine learning projects.
5
5
  Author-email: Karl Loza <luigiloza@gmail.com>
6
6
  License-Expression: MIT
@@ -1,7 +1,7 @@
1
- dragon_ml_toolbox-11.0.0.dist-info/licenses/LICENSE,sha256=L35WDmmLZNTlJvxF6Vy7Uy4SYNi6rCfWUqlTHpoRMoU,1081
2
- dragon_ml_toolbox-11.0.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=iy2r_R7wjzsCbz_Q_jMsp_jfZ6oP8XW9QhwzRBH0mGY,1904
3
- ml_tools/ETL_cleaning.py,sha256=m-Zo37RhZCZnOhby2at2fDJei8KqLGbr6wRugI9LVKI,20366
4
- ml_tools/ETL_engineering.py,sha256=cefk-Kn2ljs4laWHFtlm7K8UKvSuhrzkSzz9kbU_wwE,54074
1
+ dragon_ml_toolbox-11.1.1.dist-info/licenses/LICENSE,sha256=L35WDmmLZNTlJvxF6Vy7Uy4SYNi6rCfWUqlTHpoRMoU,1081
2
+ dragon_ml_toolbox-11.1.1.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=iy2r_R7wjzsCbz_Q_jMsp_jfZ6oP8XW9QhwzRBH0mGY,1904
3
+ ml_tools/ETL_cleaning.py,sha256=-JrYkT8AvkZFK-Agzhp6uVxaZXzFw49t0txjf6Z1Apw,20365
4
+ ml_tools/ETL_engineering.py,sha256=9dmNd2e3fUldwhIggogGKPlxTb02rtb463Kq5QHnqJo,54551
5
5
  ml_tools/GUI_tools.py,sha256=kEQWg-bog3pB5tI22gMGKWaCGHnz9TB2Lvvfhf5F2CI,45412
6
6
  ml_tools/MICE_imputation.py,sha256=kVSythWfxJFR4-2mtcYCWQaQ1Oz5yyx_SJu5gjnS7H8,11670
7
7
  ml_tools/ML_callbacks.py,sha256=JPvEw_cW5tYNJ2rMSgnNrKLuni_UrmuhDFaOw-u2SvA,13926
@@ -31,7 +31,7 @@ ml_tools/keys.py,sha256=FDpbS3Jb0pjrVvvp2_8nZi919mbob_-xwuy5OOtKM_A,1848
31
31
  ml_tools/optimization_tools.py,sha256=P3I6lIpvZ8Xf2kX5FvvBKBmrK2pB6idBpkTzfUJxTeE,5073
32
32
  ml_tools/path_manager.py,sha256=ke0MYOhYheRPX599GUbrvRsYHn2JKUmMDldS5LP6LQA,18431
33
33
  ml_tools/utilities.py,sha256=uheMUjQJ1zI69gASsE-mCq4KlRPVGgrgqson02rGNYM,30755
34
- dragon_ml_toolbox-11.0.0.dist-info/METADATA,sha256=Ag-JWZUXEh8hr_R7j5kknLTVhVZxcoBCm7aX6I4PCmM,6657
35
- dragon_ml_toolbox-11.0.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
36
- dragon_ml_toolbox-11.0.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
37
- dragon_ml_toolbox-11.0.0.dist-info/RECORD,,
34
+ dragon_ml_toolbox-11.1.1.dist-info/METADATA,sha256=Vl_AVzC58IA6OESD3NQTPADls7o_eN5dl-s2qKdWBZI,6657
35
+ dragon_ml_toolbox-11.1.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
36
+ dragon_ml_toolbox-11.1.1.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
37
+ dragon_ml_toolbox-11.1.1.dist-info/RECORD,,
ml_tools/ETL_cleaning.py CHANGED
@@ -22,7 +22,7 @@ __all__ = [
22
22
  def save_unique_values(csv_path: Union[str, Path],
23
23
  output_dir: Union[str, Path],
24
24
  verbose: bool=False,
25
- keep_column_order: bool = False) -> None:
25
+ keep_column_order: bool = True) -> None:
26
26
  """
27
27
  Loads a CSV file, then analyzes it and saves the unique non-null values
28
28
  from each column into a separate text file exactly as they appear.
@@ -106,7 +106,7 @@ class DataProcessor:
106
106
  """
107
107
  Transforms a Polars DataFrame based on a provided `TransformationRecipe` object.
108
108
 
109
- Use the method `transform()`.
109
+ Use the methods `transform()` or `load_transform_save()`.
110
110
  """
111
111
  def __init__(self, recipe: TransformationRecipe):
112
112
  """
@@ -186,7 +186,7 @@ class DataProcessor:
186
186
 
187
187
  # Case 2: Transformer's output is an independent name.
188
188
  # Action: Prepend the prefix to the output name.
189
- # Example: input='ratio', output='A_div_B', prefix='spec' -> 'spec_A_div_B'
189
+ # Example: input='ratio', output='A_B', prefix='spec' -> 'spec_A_B'
190
190
  else:
191
191
  new_names[col] = f"{prefix}_{col}"
192
192
 
@@ -299,7 +299,7 @@ class BinaryTransformer:
299
299
  _LOGGER.error("Provide either 'true_keywords' or 'false_keywords', but not both.")
300
300
  raise ValueError()
301
301
  if true_keywords is None and false_keywords is None:
302
- _LOGGER.error("You must provide either 'true_keywords' or 'false_keywords'.")
302
+ _LOGGER.error("Provide either 'true_keywords' or 'false_keywords'.")
303
303
  raise ValueError()
304
304
 
305
305
  # --- Configuration ---
@@ -331,16 +331,17 @@ class BinaryTransformer:
331
331
  Returns:
332
332
  pl.Series: A new Series of type UInt8 containing 1s and 0s.
333
333
  """
334
+ column_base_name = column.name
334
335
  # Create a boolean Series: True if any keyword is found, else False
335
336
  contains_keyword = column.str.contains(self.pattern)
336
337
 
337
338
  # Apply logic and cast directly to integer type
338
339
  if self.mode == "true_mode":
339
340
  # True -> 1, False -> 0
340
- return contains_keyword.cast(pl.UInt8)
341
+ return contains_keyword.cast(pl.UInt8).alias(column_base_name)
341
342
  else: # false_mode
342
343
  # We want the inverse: True -> 0, False -> 1
343
- return (~contains_keyword).cast(pl.UInt8)
344
+ return (~contains_keyword).cast(pl.UInt8).alias(column_base_name)
344
345
 
345
346
 
346
347
  class AutoDummifier:
@@ -410,11 +411,12 @@ class MultiBinaryDummifier:
410
411
  Returns:
411
412
  pl.DataFrame: A DataFrame where each column corresponds to a keyword.
412
413
  """
414
+ column_base_name = column.name
413
415
  # Ensure the input is treated as a string, preserving nulls
414
416
  str_column = column.cast(pl.Utf8)
415
417
 
416
418
  output_expressions = []
417
- for i, keyword in enumerate(self.keywords):
419
+ for keyword in self.keywords:
418
420
  # Escape keyword to treat it as a literal, not a regex pattern
419
421
  base_pattern = re.escape(keyword)
420
422
 
@@ -428,7 +430,7 @@ class MultiBinaryDummifier:
428
430
  .when(str_column.str.contains(pattern))
429
431
  .then(pl.lit(1, dtype=pl.UInt8))
430
432
  .otherwise(pl.lit(0, dtype=pl.UInt8))
431
- .alias(f"col_{i}") # Generic name for DataProcessor
433
+ .alias(f"{column_base_name}_{keyword}") # name for DataProcessor
432
434
  )
433
435
  output_expressions.append(expr)
434
436
 
@@ -472,6 +474,7 @@ class KeywordDummifier:
472
474
  Returns:
473
475
  pl.DataFrame: A DataFrame with one-hot encoded columns.
474
476
  """
477
+ column_base_name = column.name
475
478
  column = column.cast(pl.Utf8)
476
479
 
477
480
  categorize_expr = pl.when(pl.lit(False)).then(pl.lit(None, dtype=pl.Utf8))
@@ -490,22 +493,24 @@ class KeywordDummifier:
490
493
  column.str.contains(pattern)
491
494
  ).then(pl.lit(name))
492
495
 
493
- categorize_expr = categorize_expr.otherwise(None).alias("category")
496
+ dummy_name = 'dummy_category'
497
+
498
+ categorize_expr = categorize_expr.otherwise(None).alias(dummy_name)
494
499
 
495
500
  temp_df = pl.select(categorize_expr)
496
- df_with_dummies = temp_df.to_dummies(columns=["category"])
501
+ df_with_dummies = temp_df.to_dummies(columns=[dummy_name])
497
502
 
498
503
  final_columns = []
499
504
  for name in self.group_names:
500
- dummy_col_name = f"category_{name}"
505
+ dummy_col_name = f"{dummy_name}_{name}"
501
506
  if dummy_col_name in df_with_dummies.columns:
502
- # The alias here uses the group name as the temporary column name
507
+ # The alias here uses the group name as the final column name
503
508
  final_columns.append(
504
- df_with_dummies.get_column(dummy_col_name).alias(name)
509
+ df_with_dummies.get_column(dummy_col_name).alias(f"{column_base_name}_{name}")
505
510
  )
506
511
  else:
507
512
  # If a group had no matches, create a column of zeros
508
- final_columns.append(pl.lit(0, dtype=pl.UInt8).alias(name))
513
+ final_columns.append(pl.lit(0, dtype=pl.UInt8).alias(f"{column_base_name}_{name}"))
509
514
 
510
515
  return pl.select(final_columns)
511
516
 
@@ -526,7 +531,7 @@ class NumberExtractor:
526
531
  round_digits (int | None):
527
532
  If the dtype is 'float', you can specify the number of decimal
528
533
  places to round the result to. This parameter is ignored if
529
- dtype is 'int'. Defaults to None (no rounding).
534
+ dtype is 'int'.
530
535
  """
531
536
  def __init__(
532
537
  self,
@@ -574,6 +579,7 @@ class NumberExtractor:
574
579
  Returns:
575
580
  pl.Series: A new Series containing the extracted numbers.
576
581
  """
582
+ column_base_name = column.name
577
583
  # Extract the first (and only) capturing group
578
584
  extracted = column.str.extract(self.regex_pattern, 1)
579
585
 
@@ -584,7 +590,7 @@ class NumberExtractor:
584
590
  if self.dtype == "float" and self.round_digits is not None:
585
591
  return casted.round(self.round_digits)
586
592
 
587
- return casted
593
+ return casted.alias(column_base_name)
588
594
 
589
595
 
590
596
  class MultiNumberExtractor:
@@ -645,12 +651,13 @@ class MultiNumberExtractor:
645
651
  """
646
652
  Executes the multi-number extraction logic. Preserves nulls from the input column.
647
653
  """
654
+ column_base_name = column.name
648
655
  output_expressions = []
649
656
  for i in range(self.num_outputs):
650
657
  # Define the core extraction logic for the i-th number
651
658
  extraction_expr = (
652
659
  column.str.extract_all(self.regex_pattern)
653
- .list.get(i)
660
+ .list.get(i, null_on_oob=True)
654
661
  .cast(self.polars_dtype, strict=False)
655
662
  )
656
663
 
@@ -664,7 +671,7 @@ class MultiNumberExtractor:
664
671
  pl.when(column.is_not_null())
665
672
  .then(extraction_expr)
666
673
  .otherwise(None)
667
- .alias(f"col_{i}") # Name the final output expression
674
+ .alias(f"{column_base_name}_{i}") # Name the final output expression
668
675
  )
669
676
 
670
677
  output_expressions.append(final_expr)
@@ -731,6 +738,7 @@ class TemperatureExtractor:
731
738
  Returns:
732
739
  pl.Series: A new Series containing the final temperature values as floats.
733
740
  """
741
+ column_base_name = column.name
734
742
  # --- Step 1: Extract number(s) to get a Celsius value expression ---
735
743
  if self.average_mode:
736
744
  # Extract all numbers and compute their mean. Polars' list.mean()
@@ -759,7 +767,7 @@ class TemperatureExtractor:
759
767
  # --- Step 3: Round the result and return as a Series ---
760
768
  # The select().to_series() pattern is a robust way to execute an
761
769
  # expression and guarantee a Series is returned.
762
- return pl.select(final_expr.round(2)).to_series()
770
+ return pl.select(final_expr.round(2)).to_series().alias(column_base_name)
763
771
 
764
772
 
765
773
  class MultiTemperatureExtractor:
@@ -820,6 +828,7 @@ class MultiTemperatureExtractor:
820
828
  """
821
829
  Applies the multi-temperature extraction and conversion logic.
822
830
  """
831
+ column_base_name = column.name
823
832
  output_expressions = []
824
833
  for i in range(self.num_outputs):
825
834
  # --- Step 1: Extract the i-th number as a Celsius value ---
@@ -850,7 +859,7 @@ class MultiTemperatureExtractor:
850
859
  pl.when(column.is_not_null())
851
860
  .then(final_expr)
852
861
  .otherwise(None)
853
- .alias(f"col_{i}") # Temporary name for DataProcessor
862
+ .alias(f"{column_base_name}_{i}") # Temporary name for DataProcessor
854
863
  )
855
864
 
856
865
  output_expressions.append(final_expr)
@@ -892,6 +901,7 @@ class RatioCalculator:
892
901
  """
893
902
  Applies the ratio calculation logic to the input column. Uses .str.extract() for maximum stability and includes optional handling for zeros and single numbers.
894
903
  """
904
+ column_base_name = column.name
895
905
  # Extract numerator (group 1) and denominator (group 2) separately.
896
906
  numerator_expr = column.str.extract(self.regex_pattern, 1).cast(pl.Float64, strict=False)
897
907
  denominator_expr = column.str.extract(self.regex_pattern, 2).cast(pl.Float64, strict=False)
@@ -929,7 +939,7 @@ class RatioCalculator:
929
939
  else:
930
940
  final_expr = ratio_expr
931
941
 
932
- return pl.select(final_expr.round(4)).to_series()
942
+ return pl.select(final_expr.round(4)).to_series().alias(column_base_name)
933
943
 
934
944
 
935
945
  class TriRatioCalculator:
@@ -970,6 +980,7 @@ class TriRatioCalculator:
970
980
  """
971
981
  Applies the robust tri-ratio logic using the lazy API.
972
982
  """
983
+ column_base_name = column.name
973
984
  # Wrap the input Series in a DataFrame to use the lazy expression API
974
985
  temp_df = column.to_frame()
975
986
 
@@ -994,8 +1005,8 @@ class TriRatioCalculator:
994
1005
 
995
1006
  # Execute the expressions and return the final DataFrame
996
1007
  return temp_df.select(
997
- A_div_B=ratio_ab_expr,
998
- A_div_C=ratio_ac_expr
1008
+ ratio_ab_expr.alias(f"{column_base_name}_A_to_B"),
1009
+ ratio_ac_expr.alias(f"{column_base_name}_A_to_C")
999
1010
  )
1000
1011
 
1001
1012
 
@@ -1036,6 +1047,7 @@ class CategoryMapper:
1036
1047
  Returns:
1037
1048
  pl.Series: A new Series with categories mapped to numbers.
1038
1049
  """
1050
+ column_base_name = column.name
1039
1051
  # Ensure the column is treated as a string for matching keys
1040
1052
  str_column = column.cast(pl.Utf8)
1041
1053
 
@@ -1052,7 +1064,7 @@ class CategoryMapper:
1052
1064
  pl.lit(self.default_value)
1053
1065
  )
1054
1066
 
1055
- return pl.select(final_expr).to_series()
1067
+ return pl.select(final_expr).to_series().alias(column_base_name)
1056
1068
 
1057
1069
 
1058
1070
  class RegexMapper:
@@ -1116,6 +1128,7 @@ class RegexMapper:
1116
1128
  pl.Series: A new Series with strings mapped to numbers based on
1117
1129
  the first matching regex pattern.
1118
1130
  """
1131
+ column_base_name = column.name
1119
1132
  # pl.String is the modern alias for pl.Utf8
1120
1133
  str_column = column.cast(pl.String)
1121
1134
 
@@ -1130,7 +1143,7 @@ class RegexMapper:
1130
1143
  .otherwise(mapping_expr)
1131
1144
  )
1132
1145
 
1133
- return pl.select(mapping_expr).to_series()
1146
+ return pl.select(mapping_expr).to_series().alias(column_base_name)
1134
1147
 
1135
1148
 
1136
1149
  class ValueBinner:
@@ -1180,6 +1193,7 @@ class ValueBinner:
1180
1193
  pl.Series: A new Series of integer labels for the bins. Values
1181
1194
  outside the specified breaks will become null.
1182
1195
  """
1196
+ column_base_name = column.name
1183
1197
  # `cut` creates a new column of type Categorical
1184
1198
  binned_column = column.cut(
1185
1199
  breaks=self.breaks,
@@ -1189,7 +1203,7 @@ class ValueBinner:
1189
1203
 
1190
1204
  # to_physical() converts the Categorical type to its underlying
1191
1205
  # integer representation (u32), which is perfect for ML.
1192
- return binned_column.to_physical()
1206
+ return binned_column.to_physical().alias(column_base_name)
1193
1207
 
1194
1208
 
1195
1209
  class DateFeatureExtractor:
@@ -1198,16 +1212,6 @@ class DateFeatureExtractor:
1198
1212
 
1199
1213
  It can handle columns that are already in a Polars Date/Datetime format,
1200
1214
  or it can parse string columns if a format is provided.
1201
-
1202
- Args:
1203
- features (List[str]):
1204
- A list of the date/time features to extract. Supported features are:
1205
- 'year', 'month', 'day', 'hour', 'minute', 'second', 'millisecond',
1206
- 'microsecond', 'nanosecond', 'ordinal_day' (day of year),
1207
- 'weekday' (Mon=1, Sun=7), 'week' (week of year), and 'timestamp'.
1208
- format (str | None):
1209
- The format code used to parse string dates (e.g., "%Y-%m-%d %H:%M:%S").
1210
- Use if the input column is not a Date or Datetime type.
1211
1215
  """
1212
1216
 
1213
1217
  ALLOWED_FEATURES = {
@@ -1220,6 +1224,17 @@ class DateFeatureExtractor:
1220
1224
  features: List[str],
1221
1225
  format: Optional[str] = None,
1222
1226
  ):
1227
+ """
1228
+ Args:
1229
+ features (List[str]):
1230
+ A list of the date/time features to extract. Supported features are:
1231
+ 'year', 'month', 'day', 'hour', 'minute', 'second', 'millisecond',
1232
+ 'microsecond', 'nanosecond', 'ordinal_day' (day of year),
1233
+ 'weekday' (Mon=1, Sun=7), 'week' (week of year), 'timestamp'.
1234
+ format (str | None):
1235
+ The format code used to parse string dates (e.g., "%Y-%m-%d %H:%M:%S").
1236
+ Use if the input column is not a Date or Datetime type.
1237
+ """
1223
1238
  # --- Validation ---
1224
1239
  if not isinstance(features, list) or not features:
1225
1240
  _LOGGER.error("'features' must be a non-empty list of strings.")
@@ -1243,6 +1258,7 @@ class DateFeatureExtractor:
1243
1258
  Returns:
1244
1259
  pl.DataFrame: A DataFrame with columns for each extracted feature.
1245
1260
  """
1261
+ column_base_name = column.name
1246
1262
  date_col = column
1247
1263
  # First, parse strings into a datetime object if a format is given
1248
1264
  if self.format is not None:
@@ -1258,7 +1274,7 @@ class DateFeatureExtractor:
1258
1274
  expr = getattr(date_col.dt, feature)()
1259
1275
 
1260
1276
  # Alias with a generic name for the processor to handle
1261
- output_expressions.append(expr.alias(f"col_{i}"))
1277
+ output_expressions.append(expr.alias(f"{column_base_name}_{feature}"))
1262
1278
 
1263
1279
  return pl.select(output_expressions)
1264
1280
 
@@ -1275,20 +1291,10 @@ class MolecularFormulaTransformer:
1275
1291
  It is designed to be used within the DataProcessor pipeline.
1276
1292
  """
1277
1293
 
1278
- def __init__(self, prefix: str = "Fraction", separator: str = "_"):
1294
+ def __init__(self):
1279
1295
  """
1280
1296
  Initializes the transformer and pre-compiles the regex pattern.
1281
-
1282
- Args:
1283
- prefix (str): The prefix for the output column names. Defaults to "Fraction".
1284
- separator (str): The separator between the prefix and element symbol. Defaults to "_".
1285
1297
  """
1286
- if not isinstance(prefix, str) or not isinstance(separator, str):
1287
- _LOGGER.error("'prefix' and 'separator' must be strings.")
1288
- raise TypeError()
1289
-
1290
- self.prefix = prefix
1291
- self.separator = separator
1292
1298
  # Sort symbols by length to prevent matching 'C' in 'Co'
1293
1299
  sorted_symbols = sorted(CHEMICAL_ELEMENT_SYMBOLS, key=len, reverse=True)
1294
1300
 
@@ -1305,6 +1311,7 @@ class MolecularFormulaTransformer:
1305
1311
  Returns:
1306
1312
  A Polars DataFrame with columns for every chemical element.
1307
1313
  """
1314
+ column_base_name = column.name
1308
1315
  def parse_formula(formula: str) -> dict:
1309
1316
  """Helper to parse a single formula string into a dictionary."""
1310
1317
  if not isinstance(formula, str) or not formula:
@@ -1328,7 +1335,7 @@ class MolecularFormulaTransformer:
1328
1335
  # Ensure all possible element columns are created, filling with 0
1329
1336
  select_expressions = []
1330
1337
  for symbol in CHEMICAL_ELEMENT_SYMBOLS:
1331
- col_name = f"{self.prefix}{self.separator}{symbol}"
1338
+ col_name = f"{column_base_name}_{symbol}"
1332
1339
  if symbol in df.columns:
1333
1340
  expr = pl.col(symbol).fill_null(0).alias(col_name)
1334
1341
  else: