dragon-ml-toolbox 11.0.0__tar.gz → 11.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dragon-ml-toolbox might be problematic. Click here for more details.
- {dragon_ml_toolbox-11.0.0/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-11.1.1}/PKG-INFO +1 -1
- {dragon_ml_toolbox-11.0.0 → dragon_ml_toolbox-11.1.1/dragon_ml_toolbox.egg-info}/PKG-INFO +1 -1
- {dragon_ml_toolbox-11.0.0 → dragon_ml_toolbox-11.1.1}/ml_tools/ETL_cleaning.py +1 -1
- {dragon_ml_toolbox-11.0.0 → dragon_ml_toolbox-11.1.1}/ml_tools/ETL_engineering.py +55 -48
- {dragon_ml_toolbox-11.0.0 → dragon_ml_toolbox-11.1.1}/pyproject.toml +1 -1
- {dragon_ml_toolbox-11.0.0 → dragon_ml_toolbox-11.1.1}/LICENSE +0 -0
- {dragon_ml_toolbox-11.0.0 → dragon_ml_toolbox-11.1.1}/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-11.0.0 → dragon_ml_toolbox-11.1.1}/README.md +0 -0
- {dragon_ml_toolbox-11.0.0 → dragon_ml_toolbox-11.1.1}/dragon_ml_toolbox.egg-info/SOURCES.txt +0 -0
- {dragon_ml_toolbox-11.0.0 → dragon_ml_toolbox-11.1.1}/dragon_ml_toolbox.egg-info/dependency_links.txt +0 -0
- {dragon_ml_toolbox-11.0.0 → dragon_ml_toolbox-11.1.1}/dragon_ml_toolbox.egg-info/requires.txt +0 -0
- {dragon_ml_toolbox-11.0.0 → dragon_ml_toolbox-11.1.1}/dragon_ml_toolbox.egg-info/top_level.txt +0 -0
- {dragon_ml_toolbox-11.0.0 → dragon_ml_toolbox-11.1.1}/ml_tools/GUI_tools.py +0 -0
- {dragon_ml_toolbox-11.0.0 → dragon_ml_toolbox-11.1.1}/ml_tools/MICE_imputation.py +0 -0
- {dragon_ml_toolbox-11.0.0 → dragon_ml_toolbox-11.1.1}/ml_tools/ML_callbacks.py +0 -0
- {dragon_ml_toolbox-11.0.0 → dragon_ml_toolbox-11.1.1}/ml_tools/ML_datasetmaster.py +0 -0
- {dragon_ml_toolbox-11.0.0 → dragon_ml_toolbox-11.1.1}/ml_tools/ML_evaluation.py +0 -0
- {dragon_ml_toolbox-11.0.0 → dragon_ml_toolbox-11.1.1}/ml_tools/ML_evaluation_multi.py +0 -0
- {dragon_ml_toolbox-11.0.0 → dragon_ml_toolbox-11.1.1}/ml_tools/ML_inference.py +0 -0
- {dragon_ml_toolbox-11.0.0 → dragon_ml_toolbox-11.1.1}/ml_tools/ML_models.py +0 -0
- {dragon_ml_toolbox-11.0.0 → dragon_ml_toolbox-11.1.1}/ml_tools/ML_optimization.py +0 -0
- {dragon_ml_toolbox-11.0.0 → dragon_ml_toolbox-11.1.1}/ml_tools/ML_scaler.py +0 -0
- {dragon_ml_toolbox-11.0.0 → dragon_ml_toolbox-11.1.1}/ml_tools/ML_trainer.py +0 -0
- {dragon_ml_toolbox-11.0.0 → dragon_ml_toolbox-11.1.1}/ml_tools/PSO_optimization.py +0 -0
- {dragon_ml_toolbox-11.0.0 → dragon_ml_toolbox-11.1.1}/ml_tools/RNN_forecast.py +0 -0
- {dragon_ml_toolbox-11.0.0 → dragon_ml_toolbox-11.1.1}/ml_tools/SQL.py +0 -0
- {dragon_ml_toolbox-11.0.0 → dragon_ml_toolbox-11.1.1}/ml_tools/VIF_factor.py +0 -0
- {dragon_ml_toolbox-11.0.0 → dragon_ml_toolbox-11.1.1}/ml_tools/__init__.py +0 -0
- {dragon_ml_toolbox-11.0.0 → dragon_ml_toolbox-11.1.1}/ml_tools/_logger.py +0 -0
- {dragon_ml_toolbox-11.0.0 → dragon_ml_toolbox-11.1.1}/ml_tools/_script_info.py +0 -0
- {dragon_ml_toolbox-11.0.0 → dragon_ml_toolbox-11.1.1}/ml_tools/constants.py +0 -0
- {dragon_ml_toolbox-11.0.0 → dragon_ml_toolbox-11.1.1}/ml_tools/custom_logger.py +0 -0
- {dragon_ml_toolbox-11.0.0 → dragon_ml_toolbox-11.1.1}/ml_tools/data_exploration.py +0 -0
- {dragon_ml_toolbox-11.0.0 → dragon_ml_toolbox-11.1.1}/ml_tools/ensemble_evaluation.py +0 -0
- {dragon_ml_toolbox-11.0.0 → dragon_ml_toolbox-11.1.1}/ml_tools/ensemble_inference.py +0 -0
- {dragon_ml_toolbox-11.0.0 → dragon_ml_toolbox-11.1.1}/ml_tools/ensemble_learning.py +0 -0
- {dragon_ml_toolbox-11.0.0 → dragon_ml_toolbox-11.1.1}/ml_tools/handle_excel.py +0 -0
- {dragon_ml_toolbox-11.0.0 → dragon_ml_toolbox-11.1.1}/ml_tools/keys.py +0 -0
- {dragon_ml_toolbox-11.0.0 → dragon_ml_toolbox-11.1.1}/ml_tools/optimization_tools.py +0 -0
- {dragon_ml_toolbox-11.0.0 → dragon_ml_toolbox-11.1.1}/ml_tools/path_manager.py +0 -0
- {dragon_ml_toolbox-11.0.0 → dragon_ml_toolbox-11.1.1}/ml_tools/utilities.py +0 -0
- {dragon_ml_toolbox-11.0.0 → dragon_ml_toolbox-11.1.1}/setup.cfg +0 -0
|
@@ -22,7 +22,7 @@ __all__ = [
|
|
|
22
22
|
def save_unique_values(csv_path: Union[str, Path],
|
|
23
23
|
output_dir: Union[str, Path],
|
|
24
24
|
verbose: bool=False,
|
|
25
|
-
keep_column_order: bool =
|
|
25
|
+
keep_column_order: bool = True) -> None:
|
|
26
26
|
"""
|
|
27
27
|
Loads a CSV file, then analyzes it and saves the unique non-null values
|
|
28
28
|
from each column into a separate text file exactly as they appear.
|
|
@@ -106,7 +106,7 @@ class DataProcessor:
|
|
|
106
106
|
"""
|
|
107
107
|
Transforms a Polars DataFrame based on a provided `TransformationRecipe` object.
|
|
108
108
|
|
|
109
|
-
Use the
|
|
109
|
+
Use the methods `transform()` or `load_transform_save()`.
|
|
110
110
|
"""
|
|
111
111
|
def __init__(self, recipe: TransformationRecipe):
|
|
112
112
|
"""
|
|
@@ -186,7 +186,7 @@ class DataProcessor:
|
|
|
186
186
|
|
|
187
187
|
# Case 2: Transformer's output is an independent name.
|
|
188
188
|
# Action: Prepend the prefix to the output name.
|
|
189
|
-
# Example: input='ratio', output='
|
|
189
|
+
# Example: input='ratio', output='A_B', prefix='spec' -> 'spec_A_B'
|
|
190
190
|
else:
|
|
191
191
|
new_names[col] = f"{prefix}_{col}"
|
|
192
192
|
|
|
@@ -299,7 +299,7 @@ class BinaryTransformer:
|
|
|
299
299
|
_LOGGER.error("Provide either 'true_keywords' or 'false_keywords', but not both.")
|
|
300
300
|
raise ValueError()
|
|
301
301
|
if true_keywords is None and false_keywords is None:
|
|
302
|
-
_LOGGER.error("
|
|
302
|
+
_LOGGER.error("Provide either 'true_keywords' or 'false_keywords'.")
|
|
303
303
|
raise ValueError()
|
|
304
304
|
|
|
305
305
|
# --- Configuration ---
|
|
@@ -331,16 +331,17 @@ class BinaryTransformer:
|
|
|
331
331
|
Returns:
|
|
332
332
|
pl.Series: A new Series of type UInt8 containing 1s and 0s.
|
|
333
333
|
"""
|
|
334
|
+
column_base_name = column.name
|
|
334
335
|
# Create a boolean Series: True if any keyword is found, else False
|
|
335
336
|
contains_keyword = column.str.contains(self.pattern)
|
|
336
337
|
|
|
337
338
|
# Apply logic and cast directly to integer type
|
|
338
339
|
if self.mode == "true_mode":
|
|
339
340
|
# True -> 1, False -> 0
|
|
340
|
-
return contains_keyword.cast(pl.UInt8)
|
|
341
|
+
return contains_keyword.cast(pl.UInt8).alias(column_base_name)
|
|
341
342
|
else: # false_mode
|
|
342
343
|
# We want the inverse: True -> 0, False -> 1
|
|
343
|
-
return (~contains_keyword).cast(pl.UInt8)
|
|
344
|
+
return (~contains_keyword).cast(pl.UInt8).alias(column_base_name)
|
|
344
345
|
|
|
345
346
|
|
|
346
347
|
class AutoDummifier:
|
|
@@ -410,11 +411,12 @@ class MultiBinaryDummifier:
|
|
|
410
411
|
Returns:
|
|
411
412
|
pl.DataFrame: A DataFrame where each column corresponds to a keyword.
|
|
412
413
|
"""
|
|
414
|
+
column_base_name = column.name
|
|
413
415
|
# Ensure the input is treated as a string, preserving nulls
|
|
414
416
|
str_column = column.cast(pl.Utf8)
|
|
415
417
|
|
|
416
418
|
output_expressions = []
|
|
417
|
-
for
|
|
419
|
+
for keyword in self.keywords:
|
|
418
420
|
# Escape keyword to treat it as a literal, not a regex pattern
|
|
419
421
|
base_pattern = re.escape(keyword)
|
|
420
422
|
|
|
@@ -428,7 +430,7 @@ class MultiBinaryDummifier:
|
|
|
428
430
|
.when(str_column.str.contains(pattern))
|
|
429
431
|
.then(pl.lit(1, dtype=pl.UInt8))
|
|
430
432
|
.otherwise(pl.lit(0, dtype=pl.UInt8))
|
|
431
|
-
.alias(f"
|
|
433
|
+
.alias(f"{column_base_name}_{keyword}") # name for DataProcessor
|
|
432
434
|
)
|
|
433
435
|
output_expressions.append(expr)
|
|
434
436
|
|
|
@@ -472,6 +474,7 @@ class KeywordDummifier:
|
|
|
472
474
|
Returns:
|
|
473
475
|
pl.DataFrame: A DataFrame with one-hot encoded columns.
|
|
474
476
|
"""
|
|
477
|
+
column_base_name = column.name
|
|
475
478
|
column = column.cast(pl.Utf8)
|
|
476
479
|
|
|
477
480
|
categorize_expr = pl.when(pl.lit(False)).then(pl.lit(None, dtype=pl.Utf8))
|
|
@@ -490,22 +493,24 @@ class KeywordDummifier:
|
|
|
490
493
|
column.str.contains(pattern)
|
|
491
494
|
).then(pl.lit(name))
|
|
492
495
|
|
|
493
|
-
|
|
496
|
+
dummy_name = 'dummy_category'
|
|
497
|
+
|
|
498
|
+
categorize_expr = categorize_expr.otherwise(None).alias(dummy_name)
|
|
494
499
|
|
|
495
500
|
temp_df = pl.select(categorize_expr)
|
|
496
|
-
df_with_dummies = temp_df.to_dummies(columns=[
|
|
501
|
+
df_with_dummies = temp_df.to_dummies(columns=[dummy_name])
|
|
497
502
|
|
|
498
503
|
final_columns = []
|
|
499
504
|
for name in self.group_names:
|
|
500
|
-
dummy_col_name = f"
|
|
505
|
+
dummy_col_name = f"{dummy_name}_{name}"
|
|
501
506
|
if dummy_col_name in df_with_dummies.columns:
|
|
502
|
-
# The alias here uses the group name as the
|
|
507
|
+
# The alias here uses the group name as the final column name
|
|
503
508
|
final_columns.append(
|
|
504
|
-
df_with_dummies.get_column(dummy_col_name).alias(name)
|
|
509
|
+
df_with_dummies.get_column(dummy_col_name).alias(f"{column_base_name}_{name}")
|
|
505
510
|
)
|
|
506
511
|
else:
|
|
507
512
|
# If a group had no matches, create a column of zeros
|
|
508
|
-
final_columns.append(pl.lit(0, dtype=pl.UInt8).alias(name))
|
|
513
|
+
final_columns.append(pl.lit(0, dtype=pl.UInt8).alias(f"{column_base_name}_{name}"))
|
|
509
514
|
|
|
510
515
|
return pl.select(final_columns)
|
|
511
516
|
|
|
@@ -526,7 +531,7 @@ class NumberExtractor:
|
|
|
526
531
|
round_digits (int | None):
|
|
527
532
|
If the dtype is 'float', you can specify the number of decimal
|
|
528
533
|
places to round the result to. This parameter is ignored if
|
|
529
|
-
dtype is 'int'.
|
|
534
|
+
dtype is 'int'.
|
|
530
535
|
"""
|
|
531
536
|
def __init__(
|
|
532
537
|
self,
|
|
@@ -574,6 +579,7 @@ class NumberExtractor:
|
|
|
574
579
|
Returns:
|
|
575
580
|
pl.Series: A new Series containing the extracted numbers.
|
|
576
581
|
"""
|
|
582
|
+
column_base_name = column.name
|
|
577
583
|
# Extract the first (and only) capturing group
|
|
578
584
|
extracted = column.str.extract(self.regex_pattern, 1)
|
|
579
585
|
|
|
@@ -584,7 +590,7 @@ class NumberExtractor:
|
|
|
584
590
|
if self.dtype == "float" and self.round_digits is not None:
|
|
585
591
|
return casted.round(self.round_digits)
|
|
586
592
|
|
|
587
|
-
return casted
|
|
593
|
+
return casted.alias(column_base_name)
|
|
588
594
|
|
|
589
595
|
|
|
590
596
|
class MultiNumberExtractor:
|
|
@@ -645,12 +651,13 @@ class MultiNumberExtractor:
|
|
|
645
651
|
"""
|
|
646
652
|
Executes the multi-number extraction logic. Preserves nulls from the input column.
|
|
647
653
|
"""
|
|
654
|
+
column_base_name = column.name
|
|
648
655
|
output_expressions = []
|
|
649
656
|
for i in range(self.num_outputs):
|
|
650
657
|
# Define the core extraction logic for the i-th number
|
|
651
658
|
extraction_expr = (
|
|
652
659
|
column.str.extract_all(self.regex_pattern)
|
|
653
|
-
.list.get(i)
|
|
660
|
+
.list.get(i, null_on_oob=True)
|
|
654
661
|
.cast(self.polars_dtype, strict=False)
|
|
655
662
|
)
|
|
656
663
|
|
|
@@ -664,7 +671,7 @@ class MultiNumberExtractor:
|
|
|
664
671
|
pl.when(column.is_not_null())
|
|
665
672
|
.then(extraction_expr)
|
|
666
673
|
.otherwise(None)
|
|
667
|
-
.alias(f"
|
|
674
|
+
.alias(f"{column_base_name}_{i}") # Name the final output expression
|
|
668
675
|
)
|
|
669
676
|
|
|
670
677
|
output_expressions.append(final_expr)
|
|
@@ -731,6 +738,7 @@ class TemperatureExtractor:
|
|
|
731
738
|
Returns:
|
|
732
739
|
pl.Series: A new Series containing the final temperature values as floats.
|
|
733
740
|
"""
|
|
741
|
+
column_base_name = column.name
|
|
734
742
|
# --- Step 1: Extract number(s) to get a Celsius value expression ---
|
|
735
743
|
if self.average_mode:
|
|
736
744
|
# Extract all numbers and compute their mean. Polars' list.mean()
|
|
@@ -759,7 +767,7 @@ class TemperatureExtractor:
|
|
|
759
767
|
# --- Step 3: Round the result and return as a Series ---
|
|
760
768
|
# The select().to_series() pattern is a robust way to execute an
|
|
761
769
|
# expression and guarantee a Series is returned.
|
|
762
|
-
return pl.select(final_expr.round(2)).to_series()
|
|
770
|
+
return pl.select(final_expr.round(2)).to_series().alias(column_base_name)
|
|
763
771
|
|
|
764
772
|
|
|
765
773
|
class MultiTemperatureExtractor:
|
|
@@ -820,6 +828,7 @@ class MultiTemperatureExtractor:
|
|
|
820
828
|
"""
|
|
821
829
|
Applies the multi-temperature extraction and conversion logic.
|
|
822
830
|
"""
|
|
831
|
+
column_base_name = column.name
|
|
823
832
|
output_expressions = []
|
|
824
833
|
for i in range(self.num_outputs):
|
|
825
834
|
# --- Step 1: Extract the i-th number as a Celsius value ---
|
|
@@ -850,7 +859,7 @@ class MultiTemperatureExtractor:
|
|
|
850
859
|
pl.when(column.is_not_null())
|
|
851
860
|
.then(final_expr)
|
|
852
861
|
.otherwise(None)
|
|
853
|
-
.alias(f"
|
|
862
|
+
.alias(f"{column_base_name}_{i}") # Temporary name for DataProcessor
|
|
854
863
|
)
|
|
855
864
|
|
|
856
865
|
output_expressions.append(final_expr)
|
|
@@ -892,6 +901,7 @@ class RatioCalculator:
|
|
|
892
901
|
"""
|
|
893
902
|
Applies the ratio calculation logic to the input column. Uses .str.extract() for maximum stability and includes optional handling for zeros and single numbers.
|
|
894
903
|
"""
|
|
904
|
+
column_base_name = column.name
|
|
895
905
|
# Extract numerator (group 1) and denominator (group 2) separately.
|
|
896
906
|
numerator_expr = column.str.extract(self.regex_pattern, 1).cast(pl.Float64, strict=False)
|
|
897
907
|
denominator_expr = column.str.extract(self.regex_pattern, 2).cast(pl.Float64, strict=False)
|
|
@@ -929,7 +939,7 @@ class RatioCalculator:
|
|
|
929
939
|
else:
|
|
930
940
|
final_expr = ratio_expr
|
|
931
941
|
|
|
932
|
-
return pl.select(final_expr.round(4)).to_series()
|
|
942
|
+
return pl.select(final_expr.round(4)).to_series().alias(column_base_name)
|
|
933
943
|
|
|
934
944
|
|
|
935
945
|
class TriRatioCalculator:
|
|
@@ -970,6 +980,7 @@ class TriRatioCalculator:
|
|
|
970
980
|
"""
|
|
971
981
|
Applies the robust tri-ratio logic using the lazy API.
|
|
972
982
|
"""
|
|
983
|
+
column_base_name = column.name
|
|
973
984
|
# Wrap the input Series in a DataFrame to use the lazy expression API
|
|
974
985
|
temp_df = column.to_frame()
|
|
975
986
|
|
|
@@ -994,8 +1005,8 @@ class TriRatioCalculator:
|
|
|
994
1005
|
|
|
995
1006
|
# Execute the expressions and return the final DataFrame
|
|
996
1007
|
return temp_df.select(
|
|
997
|
-
|
|
998
|
-
|
|
1008
|
+
ratio_ab_expr.alias(f"{column_base_name}_A_to_B"),
|
|
1009
|
+
ratio_ac_expr.alias(f"{column_base_name}_A_to_C")
|
|
999
1010
|
)
|
|
1000
1011
|
|
|
1001
1012
|
|
|
@@ -1036,6 +1047,7 @@ class CategoryMapper:
|
|
|
1036
1047
|
Returns:
|
|
1037
1048
|
pl.Series: A new Series with categories mapped to numbers.
|
|
1038
1049
|
"""
|
|
1050
|
+
column_base_name = column.name
|
|
1039
1051
|
# Ensure the column is treated as a string for matching keys
|
|
1040
1052
|
str_column = column.cast(pl.Utf8)
|
|
1041
1053
|
|
|
@@ -1052,7 +1064,7 @@ class CategoryMapper:
|
|
|
1052
1064
|
pl.lit(self.default_value)
|
|
1053
1065
|
)
|
|
1054
1066
|
|
|
1055
|
-
return pl.select(final_expr).to_series()
|
|
1067
|
+
return pl.select(final_expr).to_series().alias(column_base_name)
|
|
1056
1068
|
|
|
1057
1069
|
|
|
1058
1070
|
class RegexMapper:
|
|
@@ -1116,6 +1128,7 @@ class RegexMapper:
|
|
|
1116
1128
|
pl.Series: A new Series with strings mapped to numbers based on
|
|
1117
1129
|
the first matching regex pattern.
|
|
1118
1130
|
"""
|
|
1131
|
+
column_base_name = column.name
|
|
1119
1132
|
# pl.String is the modern alias for pl.Utf8
|
|
1120
1133
|
str_column = column.cast(pl.String)
|
|
1121
1134
|
|
|
@@ -1130,7 +1143,7 @@ class RegexMapper:
|
|
|
1130
1143
|
.otherwise(mapping_expr)
|
|
1131
1144
|
)
|
|
1132
1145
|
|
|
1133
|
-
return pl.select(mapping_expr).to_series()
|
|
1146
|
+
return pl.select(mapping_expr).to_series().alias(column_base_name)
|
|
1134
1147
|
|
|
1135
1148
|
|
|
1136
1149
|
class ValueBinner:
|
|
@@ -1180,6 +1193,7 @@ class ValueBinner:
|
|
|
1180
1193
|
pl.Series: A new Series of integer labels for the bins. Values
|
|
1181
1194
|
outside the specified breaks will become null.
|
|
1182
1195
|
"""
|
|
1196
|
+
column_base_name = column.name
|
|
1183
1197
|
# `cut` creates a new column of type Categorical
|
|
1184
1198
|
binned_column = column.cut(
|
|
1185
1199
|
breaks=self.breaks,
|
|
@@ -1189,7 +1203,7 @@ class ValueBinner:
|
|
|
1189
1203
|
|
|
1190
1204
|
# to_physical() converts the Categorical type to its underlying
|
|
1191
1205
|
# integer representation (u32), which is perfect for ML.
|
|
1192
|
-
return binned_column.to_physical()
|
|
1206
|
+
return binned_column.to_physical().alias(column_base_name)
|
|
1193
1207
|
|
|
1194
1208
|
|
|
1195
1209
|
class DateFeatureExtractor:
|
|
@@ -1198,16 +1212,6 @@ class DateFeatureExtractor:
|
|
|
1198
1212
|
|
|
1199
1213
|
It can handle columns that are already in a Polars Date/Datetime format,
|
|
1200
1214
|
or it can parse string columns if a format is provided.
|
|
1201
|
-
|
|
1202
|
-
Args:
|
|
1203
|
-
features (List[str]):
|
|
1204
|
-
A list of the date/time features to extract. Supported features are:
|
|
1205
|
-
'year', 'month', 'day', 'hour', 'minute', 'second', 'millisecond',
|
|
1206
|
-
'microsecond', 'nanosecond', 'ordinal_day' (day of year),
|
|
1207
|
-
'weekday' (Mon=1, Sun=7), 'week' (week of year), and 'timestamp'.
|
|
1208
|
-
format (str | None):
|
|
1209
|
-
The format code used to parse string dates (e.g., "%Y-%m-%d %H:%M:%S").
|
|
1210
|
-
Use if the input column is not a Date or Datetime type.
|
|
1211
1215
|
"""
|
|
1212
1216
|
|
|
1213
1217
|
ALLOWED_FEATURES = {
|
|
@@ -1220,6 +1224,17 @@ class DateFeatureExtractor:
|
|
|
1220
1224
|
features: List[str],
|
|
1221
1225
|
format: Optional[str] = None,
|
|
1222
1226
|
):
|
|
1227
|
+
"""
|
|
1228
|
+
Args:
|
|
1229
|
+
features (List[str]):
|
|
1230
|
+
A list of the date/time features to extract. Supported features are:
|
|
1231
|
+
'year', 'month', 'day', 'hour', 'minute', 'second', 'millisecond',
|
|
1232
|
+
'microsecond', 'nanosecond', 'ordinal_day' (day of year),
|
|
1233
|
+
'weekday' (Mon=1, Sun=7), 'week' (week of year), 'timestamp'.
|
|
1234
|
+
format (str | None):
|
|
1235
|
+
The format code used to parse string dates (e.g., "%Y-%m-%d %H:%M:%S").
|
|
1236
|
+
Use if the input column is not a Date or Datetime type.
|
|
1237
|
+
"""
|
|
1223
1238
|
# --- Validation ---
|
|
1224
1239
|
if not isinstance(features, list) or not features:
|
|
1225
1240
|
_LOGGER.error("'features' must be a non-empty list of strings.")
|
|
@@ -1243,6 +1258,7 @@ class DateFeatureExtractor:
|
|
|
1243
1258
|
Returns:
|
|
1244
1259
|
pl.DataFrame: A DataFrame with columns for each extracted feature.
|
|
1245
1260
|
"""
|
|
1261
|
+
column_base_name = column.name
|
|
1246
1262
|
date_col = column
|
|
1247
1263
|
# First, parse strings into a datetime object if a format is given
|
|
1248
1264
|
if self.format is not None:
|
|
@@ -1258,7 +1274,7 @@ class DateFeatureExtractor:
|
|
|
1258
1274
|
expr = getattr(date_col.dt, feature)()
|
|
1259
1275
|
|
|
1260
1276
|
# Alias with a generic name for the processor to handle
|
|
1261
|
-
output_expressions.append(expr.alias(f"
|
|
1277
|
+
output_expressions.append(expr.alias(f"{column_base_name}_{feature}"))
|
|
1262
1278
|
|
|
1263
1279
|
return pl.select(output_expressions)
|
|
1264
1280
|
|
|
@@ -1275,20 +1291,10 @@ class MolecularFormulaTransformer:
|
|
|
1275
1291
|
It is designed to be used within the DataProcessor pipeline.
|
|
1276
1292
|
"""
|
|
1277
1293
|
|
|
1278
|
-
def __init__(self
|
|
1294
|
+
def __init__(self):
|
|
1279
1295
|
"""
|
|
1280
1296
|
Initializes the transformer and pre-compiles the regex pattern.
|
|
1281
|
-
|
|
1282
|
-
Args:
|
|
1283
|
-
prefix (str): The prefix for the output column names. Defaults to "Fraction".
|
|
1284
|
-
separator (str): The separator between the prefix and element symbol. Defaults to "_".
|
|
1285
1297
|
"""
|
|
1286
|
-
if not isinstance(prefix, str) or not isinstance(separator, str):
|
|
1287
|
-
_LOGGER.error("'prefix' and 'separator' must be strings.")
|
|
1288
|
-
raise TypeError()
|
|
1289
|
-
|
|
1290
|
-
self.prefix = prefix
|
|
1291
|
-
self.separator = separator
|
|
1292
1298
|
# Sort symbols by length to prevent matching 'C' in 'Co'
|
|
1293
1299
|
sorted_symbols = sorted(CHEMICAL_ELEMENT_SYMBOLS, key=len, reverse=True)
|
|
1294
1300
|
|
|
@@ -1305,6 +1311,7 @@ class MolecularFormulaTransformer:
|
|
|
1305
1311
|
Returns:
|
|
1306
1312
|
A Polars DataFrame with columns for every chemical element.
|
|
1307
1313
|
"""
|
|
1314
|
+
column_base_name = column.name
|
|
1308
1315
|
def parse_formula(formula: str) -> dict:
|
|
1309
1316
|
"""Helper to parse a single formula string into a dictionary."""
|
|
1310
1317
|
if not isinstance(formula, str) or not formula:
|
|
@@ -1328,7 +1335,7 @@ class MolecularFormulaTransformer:
|
|
|
1328
1335
|
# Ensure all possible element columns are created, filling with 0
|
|
1329
1336
|
select_expressions = []
|
|
1330
1337
|
for symbol in CHEMICAL_ELEMENT_SYMBOLS:
|
|
1331
|
-
col_name = f"{
|
|
1338
|
+
col_name = f"{column_base_name}_{symbol}"
|
|
1332
1339
|
if symbol in df.columns:
|
|
1333
1340
|
expr = pl.col(symbol).fill_null(0).alias(col_name)
|
|
1334
1341
|
else:
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dragon_ml_toolbox-11.0.0 → dragon_ml_toolbox-11.1.1}/dragon_ml_toolbox.egg-info/SOURCES.txt
RENAMED
|
File without changes
|
|
File without changes
|
{dragon_ml_toolbox-11.0.0 → dragon_ml_toolbox-11.1.1}/dragon_ml_toolbox.egg-info/requires.txt
RENAMED
|
File without changes
|
{dragon_ml_toolbox-11.0.0 → dragon_ml_toolbox-11.1.1}/dragon_ml_toolbox.egg-info/top_level.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|