dragon-ml-toolbox 10.2.1__py3-none-any.whl → 10.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dragon-ml-toolbox might be problematic. Click here for more details.
- {dragon_ml_toolbox-10.2.1.dist-info → dragon_ml_toolbox-10.4.0.dist-info}/METADATA +1 -1
- {dragon_ml_toolbox-10.2.1.dist-info → dragon_ml_toolbox-10.4.0.dist-info}/RECORD +8 -8
- ml_tools/ETL_engineering.py +326 -14
- ml_tools/path_manager.py +1 -1
- {dragon_ml_toolbox-10.2.1.dist-info → dragon_ml_toolbox-10.4.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-10.2.1.dist-info → dragon_ml_toolbox-10.4.0.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-10.2.1.dist-info → dragon_ml_toolbox-10.4.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-10.2.1.dist-info → dragon_ml_toolbox-10.4.0.dist-info}/top_level.txt +0 -0
|
@@ -1,7 +1,7 @@
|
|
|
1
|
-
dragon_ml_toolbox-10.
|
|
2
|
-
dragon_ml_toolbox-10.
|
|
1
|
+
dragon_ml_toolbox-10.4.0.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
|
|
2
|
+
dragon_ml_toolbox-10.4.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=lY4_rJPnLnMu7YBQaY-_iz1JRDcLdQzNCyeLAF1glJY,1837
|
|
3
3
|
ml_tools/ETL_cleaning.py,sha256=lSP5q6-ukGhJBPV8dlsqJvPXAzj4du_0J-SbtEd0Pjg,19292
|
|
4
|
-
ml_tools/ETL_engineering.py,sha256=
|
|
4
|
+
ml_tools/ETL_engineering.py,sha256=LuOEIc2TMR6c1eXhh4-Budj0mTtZyrzE4yU7HPYxDds,49207
|
|
5
5
|
ml_tools/GUI_tools.py,sha256=kEQWg-bog3pB5tI22gMGKWaCGHnz9TB2Lvvfhf5F2CI,45412
|
|
6
6
|
ml_tools/MICE_imputation.py,sha256=kVSythWfxJFR4-2mtcYCWQaQ1Oz5yyx_SJu5gjnS7H8,11670
|
|
7
7
|
ml_tools/ML_callbacks.py,sha256=JPvEw_cW5tYNJ2rMSgnNrKLuni_UrmuhDFaOw-u2SvA,13926
|
|
@@ -28,9 +28,9 @@ ml_tools/ensemble_learning.py,sha256=3s0kH4i_naj0IVl_T4knst-Hwg4TScWjEdsXX5KAi7I
|
|
|
28
28
|
ml_tools/handle_excel.py,sha256=He4UT15sCGhaG-JKfs7uYVAubxWjrqgJ6U7OhMR2fuE,14005
|
|
29
29
|
ml_tools/keys.py,sha256=HtPG8-MWh89C32A7eIlfuuA-DLwkxGkoDfwR2TGN9CQ,1074
|
|
30
30
|
ml_tools/optimization_tools.py,sha256=P3I6lIpvZ8Xf2kX5FvvBKBmrK2pB6idBpkTzfUJxTeE,5073
|
|
31
|
-
ml_tools/path_manager.py,sha256=
|
|
31
|
+
ml_tools/path_manager.py,sha256=7sRvAoNrboRY6ef9gH3_qdzoZ66iLs7Aii4P39K0kEk,13819
|
|
32
32
|
ml_tools/utilities.py,sha256=SVMaSDigh6SUoAeig2_sXLLIj5w5mUs5KuVWpHvFDec,19816
|
|
33
|
-
dragon_ml_toolbox-10.
|
|
34
|
-
dragon_ml_toolbox-10.
|
|
35
|
-
dragon_ml_toolbox-10.
|
|
36
|
-
dragon_ml_toolbox-10.
|
|
33
|
+
dragon_ml_toolbox-10.4.0.dist-info/METADATA,sha256=wD3UR2VUKd0JbxZF5r6HG2_8tdwZIfP6deIQZ8Y0mDs,6968
|
|
34
|
+
dragon_ml_toolbox-10.4.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
35
|
+
dragon_ml_toolbox-10.4.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
|
|
36
|
+
dragon_ml_toolbox-10.4.0.dist-info/RECORD,,
|
ml_tools/ETL_engineering.py
CHANGED
|
@@ -1,6 +1,9 @@
|
|
|
1
1
|
import polars as pl
|
|
2
2
|
import re
|
|
3
|
+
from pathlib import Path
|
|
3
4
|
from typing import Literal, Union, Optional, Any, Callable, List, Dict, Tuple
|
|
5
|
+
from .utilities import load_dataframe, save_dataframe
|
|
6
|
+
from .path_manager import make_fullpath
|
|
4
7
|
from ._script_info import _script_info
|
|
5
8
|
from ._logger import _LOGGER
|
|
6
9
|
|
|
@@ -14,7 +17,10 @@ __all__ = [
|
|
|
14
17
|
"KeywordDummifier",
|
|
15
18
|
"NumberExtractor",
|
|
16
19
|
"MultiNumberExtractor",
|
|
20
|
+
"TemperatureExtractor",
|
|
21
|
+
"MultiTemperatureExtractor",
|
|
17
22
|
"RatioCalculator",
|
|
23
|
+
"TriRatioCalculator",
|
|
18
24
|
"CategoryMapper",
|
|
19
25
|
"RegexMapper",
|
|
20
26
|
"ValueBinner",
|
|
@@ -182,9 +188,28 @@ class DataProcessor:
|
|
|
182
188
|
if not processed_columns:
|
|
183
189
|
_LOGGER.error("The transformation resulted in an empty DataFrame.")
|
|
184
190
|
return pl.DataFrame()
|
|
185
|
-
|
|
191
|
+
|
|
192
|
+
_LOGGER.info(f"Processed dataframe with {len(processed_columns)} columns.")
|
|
193
|
+
|
|
186
194
|
return pl.DataFrame(processed_columns)
|
|
187
195
|
|
|
196
|
+
def load_transform_save(self, input_path: Union[str,Path], output_path: Union[str,Path]):
    """
    Load a dataframe from disk, run `transform` on it, and write the result.

    Args:
        input_path: Source file; resolved with `make_fullpath(..., enforce="file")`.
        output_path: Destination file; resolved with
            `make_fullpath(..., make=True, enforce="file")`.
    """
    # Resolve the source before the destination so an invalid input is
    # reported before `make=True` is applied to the output path.
    source_file = make_fullpath(input_path, enforce="file")
    target_file = make_fullpath(output_path, make=True, enforce="file")

    # The loader returns a pair; only the dataframe is used here.
    raw_df, _unused = load_dataframe(df_path=source_file, kind="polars", all_strings=True)

    transformed_df = self.transform(raw_df)

    # The saver takes the directory and the file name separately.
    save_dataframe(
        df=transformed_df,
        save_dir=target_file.parent,
        filename=target_file.name,
    )
|
|
212
|
+
|
|
188
213
|
def __str__(self) -> str:
|
|
189
214
|
"""
|
|
190
215
|
Provides a detailed, human-readable string representation of the
|
|
@@ -302,6 +327,15 @@ class AutoDummifier:
|
|
|
302
327
|
A transformer that performs one-hot encoding on a categorical column,
|
|
303
328
|
automatically detecting the unique categories from the data.
|
|
304
329
|
"""
|
|
330
|
+
def __init__(self, drop_first: bool = False) -> None:
    """
    Initializes the AutoDummifier.

    Args:
        drop_first (bool): If True, drops the first dummy column. The value
            is stored on the instance and forwarded to
            `to_dummies(drop_first=...)` when the transformer is called.
    """
    # Read later by __call__ when building the dummy columns.
    self.drop_first = drop_first
|
|
338
|
+
|
|
305
339
|
def __call__(self, column: pl.Series) -> pl.DataFrame:
|
|
306
340
|
"""
|
|
307
341
|
Executes the one-hot encoding logic.
|
|
@@ -315,7 +349,7 @@ class AutoDummifier:
|
|
|
315
349
|
'{original_col_name}_{category_value}'.
|
|
316
350
|
"""
|
|
317
351
|
# Ensure the column is treated as a string before creating dummies
|
|
318
|
-
return column.cast(pl.Utf8).to_dummies()
|
|
352
|
+
return column.cast(pl.Utf8).to_dummies(drop_first=self.drop_first)
|
|
319
353
|
|
|
320
354
|
|
|
321
355
|
class MultiBinaryDummifier:
|
|
@@ -617,16 +651,200 @@ class MultiNumberExtractor:
|
|
|
617
651
|
return pl.select(output_expressions)
|
|
618
652
|
|
|
619
653
|
|
|
654
|
+
class TemperatureExtractor:
    """
    Extracts temperature values from a string column.

    This transformer assumes that the source temperature values are in Celsius.
    It can extract a single value using a specific regex or find all numbers in
    a string and calculate their average. It also supports converting the final
    Celsius value to Kelvin or Rankine.

    Note:
        The default pattern (and the pattern used in average mode) matches
        digits with an optional decimal part only; a leading minus sign is
        not part of the match. Pass a custom `regex_pattern` if negative
        readings must be captured in single-extraction mode.

    Args:
        regex_pattern (str):
            The regex to find a single temperature. MUST contain exactly one
            capturing group `(...)`. This is ignored if `average_mode` is True.
        average_mode (bool):
            If True, extracts all numbers from the string and returns their average.
            This overrides the `regex_pattern` with a generic number-finding regex.
        convert (str | None):
            If "K", converts the final Celsius value to Kelvin.
            If "R", converts the final Celsius value to Rankine.
            If None (default), the value remains in Celsius.

    Raises:
        ValueError: If `regex_pattern` is not a valid regex or does not contain
            exactly one capturing group (single-extraction mode only), or if
            `convert` is not one of "K", "R" or None.
    """
    def __init__(
        self,
        regex_pattern: str = r"(\d+\.?\d*)",
        average_mode: bool = False,
        convert: Optional[Literal["K", "R"]] = None,
    ):
        # --- Store configuration ---
        self.average_mode = average_mode
        self.convert = convert
        self.regex_pattern = regex_pattern

        # Generic pattern used in average mode.
        self._avg_mode_pattern = r"(\d+\.?\d*)"

        # --- Validation ---
        if not self.average_mode:
            # Only the compile step can raise re.error, so keep the try minimal.
            try:
                group_count = re.compile(self.regex_pattern).groups
            except re.error as e:
                _LOGGER.error(f"Invalid regex pattern provided: {e}")
                raise ValueError() from e

            if group_count != 1:
                _LOGGER.error("'regex_pattern' must contain exactly one capturing group '(...)' for single extraction mode.")
                raise ValueError()

        if self.convert is not None and self.convert not in ["K", "R"]:
            _LOGGER.error("'convert' must be either 'K' (Kelvin) or 'R' (Rankine).")
            raise ValueError()

    def __call__(self, column: pl.Series) -> pl.Series:
        """
        Applies the temperature extraction and conversion logic.

        Args:
            column (pl.Series): The input Polars Series with string data.

        Returns:
            pl.Series: A new Series containing the final temperature values as
            floats, rounded to 2 decimals. Rows without a usable number are null.
        """
        # --- Step 1: Extract number(s) to get a Celsius value ---
        if self.average_mode:
            # `extract_all` produces a list of *strings*. The elements must be
            # cast to floats before averaging; a mean over string elements
            # would yield null for every row.
            celsius_values = (
                column.str.extract_all(self._avg_mode_pattern)
                .list.eval(pl.element().cast(pl.Float64, strict=False))
                .list.mean()
            )
        else:
            # Extract a single number using the specified pattern.
            # Cast to Float64, with non-matches becoming null.
            celsius_values = column.str.extract(self.regex_pattern, 1).cast(pl.Float64, strict=False)

        # --- Step 2: Apply conversion if specified ---
        if self.convert == "K":
            # Celsius to Kelvin: C + 273.15
            converted = celsius_values + 273.15
        elif self.convert == "R":
            # Celsius to Rankine: (C * 9/5) + 491.67
            converted = (celsius_values * 1.8) + 491.67
        else:
            # No conversion needed
            converted = celsius_values

        # --- Step 3: Round the result and return as a Series ---
        return pl.select(converted.round(2)).to_series()
|
|
738
|
+
|
|
739
|
+
|
|
740
|
+
class MultiTemperatureExtractor:
    """
    Extracts multiple temperature values from a single string column into
    several new columns, assuming the source values are in Celsius.

    This one-to-many transformer is designed for cases where multiple readings
    are packed into one field, like "Min: 10C, Max: 25C".

    Args:
        num_outputs (int):
            The number of numeric columns to create.
        regex_pattern (str):
            The regex to find all numbers. Must contain exactly one capturing
            group around the number part (e.g., r"(-?\\d+\\.?\\d*)").
            NOTE: extraction uses `str.extract_all`, which yields the whole
            match; text matched outside the capturing group will make the
            float conversion produce null, so the pattern should match the
            numeric text only.
        convert (str | None):
            If "K", converts the final Celsius values to Kelvin.
            If "R", converts the final Celsius values to Rankine.
            If None (default), the values remain in Celsius.
        fill_value (int | float | None):
            A value to use if a temperature is not found at a given position.
            For example, if `num_outputs=3` and only two temperatures are
            found, the third column will be filled with this value. If None,
            it will be filled with null.

    Raises:
        ValueError: If `num_outputs` is not a positive integer, if
            `regex_pattern` is invalid or does not contain exactly one
            capturing group, or if `convert` is not "K", "R" or None.
    """
    def __init__(
        self,
        num_outputs: int,
        regex_pattern: str = r"(\d+\.?\d*)",
        convert: Optional[Literal["K", "R"]] = None,
        fill_value: Optional[Union[int, float]] = None
    ):
        # --- Validation ---
        if not isinstance(num_outputs, int) or num_outputs <= 0:
            _LOGGER.error("'num_outputs' must be a positive integer.")
            raise ValueError()

        # Only the compile step can raise re.error, so keep the try minimal.
        try:
            group_count = re.compile(regex_pattern).groups
        except re.error as e:
            _LOGGER.error(f"Invalid regex pattern provided: {e}")
            raise ValueError() from e

        if group_count != 1:
            _LOGGER.error("'regex_pattern' must contain exactly one capturing group '(...)'.")
            raise ValueError()

        if convert is not None and convert not in ["K", "R"]:
            _LOGGER.error("'convert' must be either 'K' (Kelvin) or 'R' (Rankine).")
            raise ValueError()

        # --- Store configuration ---
        self.num_outputs = num_outputs
        self.regex_pattern = regex_pattern
        self.convert = convert
        self.fill_value = fill_value

    def __call__(self, column: pl.Series) -> pl.DataFrame:
        """
        Applies the multi-temperature extraction and conversion logic.

        Args:
            column (pl.Series): The input Polars Series with string data.

        Returns:
            pl.DataFrame: One Float64 column per requested output, named
            `col_0`, `col_1`, ... (temporary names). Values are rounded to
            2 decimals; rows whose input was null are null in every column.
        """
        # Loop-invariant work: run the regex and build the null mask once.
        all_matches = column.str.extract_all(self.regex_pattern)
        row_has_value = column.is_not_null()

        output_columns = []
        for i in range(self.num_outputs):
            # --- Step 1: Extract the i-th number as a Celsius value ---
            # slice(i, 1) + first() yields null when fewer than i+1 numbers
            # were found. A plain `.list.get(i)` raises on an out-of-bounds
            # index on newer Polars releases, which would make `fill_value`
            # unreachable.
            celsius_values = (
                all_matches.list.slice(i, 1)
                .list.first()
                .cast(pl.Float64, strict=False)
            )

            # --- Step 2: Apply conversion if specified ---
            if self.convert == "K":
                # Celsius to Kelvin: C + 273.15
                converted = celsius_values + 273.15
            elif self.convert == "R":
                # Celsius to Rankine: (C * 9/5) + 491.67
                converted = (celsius_values * 1.8) + 491.67
            else:
                # No conversion needed
                converted = celsius_values

            # --- Step 3: Apply fill value and handle original nulls ---
            result = converted.round(2)
            if self.fill_value is not None:
                result = result.fill_null(self.fill_value)

            # Ensure that if the original row was null, all outputs are null
            # (this intentionally overrides the fill value for such rows).
            result = (
                pl.when(row_has_value)
                .then(result)
                .otherwise(None)
                .alias(f"col_{i}")  # Temporary name for DataProcessor
            )

            output_columns.append(result)

        # Execute all expressions at once for performance
        return pl.select(output_columns)
|
|
835
|
+
|
|
836
|
+
|
|
620
837
|
class RatioCalculator:
|
|
621
838
|
"""
|
|
622
839
|
A transformer that parses a string ratio (e.g., "40:5" or "30/2") and
|
|
623
|
-
computes the result of the division.
|
|
624
|
-
|
|
840
|
+
computes the result of the division. Includes robust handling for
|
|
841
|
+
zeros and single numbers.
|
|
625
842
|
"""
|
|
626
843
|
def __init__(
|
|
627
844
|
self,
|
|
628
|
-
|
|
629
|
-
|
|
845
|
+
regex_pattern: str = r"(\d+\.?\d*)\s*[::/]\s*(\d+\.?\d*)",
|
|
846
|
+
handle_zeros: bool = False,
|
|
847
|
+
handle_single_number: bool = False
|
|
630
848
|
):
|
|
631
849
|
# --- Robust Validation ---
|
|
632
850
|
try:
|
|
@@ -642,26 +860,120 @@ class RatioCalculator:
|
|
|
642
860
|
raise ValueError()
|
|
643
861
|
|
|
644
862
|
self.regex_pattern = regex_pattern
|
|
863
|
+
self.handle_zeros = handle_zeros
|
|
864
|
+
self.handle_single_number = handle_single_number
|
|
645
865
|
|
|
646
866
|
def __call__(self, column: pl.Series) -> pl.Series:
    """
    Compute the ratio encoded in each string of the input column.

    The numerator and denominator are pulled out with `.str.extract()`
    (capture groups 1 and 2 of `regex_pattern`). Depending on the instance
    flags, zeros and bare single numbers receive special treatment. The
    result is rounded to 4 decimals.
    """
    # Group 1 is the numerator, group 2 the denominator; failed casts become null.
    top = column.str.extract(self.regex_pattern, 1).cast(pl.Float64, strict=False)
    bottom = column.str.extract(self.regex_pattern, 2).cast(pl.Float64, strict=False)

    if not self.handle_zeros:
        # Plain division; a null or zero denominator yields null.
        ratio = pl.when(bottom != 0).then(
            top / bottom
        ).otherwise(
            None
        )
    else:
        # Zero-aware division, evaluated only where both parts were parsed.
        zero_aware = (
            pl.when((top == 0) & (bottom == 0)).then(pl.lit(0.0))
            .when((top != 0) & (bottom == 0)).then(top)
            .when((top == 0) & (bottom != 0)).then(bottom)
            .otherwise(top / bottom)  # neither part is zero
        )
        both_present = top.is_not_null() & bottom.is_not_null()
        ratio = pl.when(both_present).then(zero_aware)

    if not self.handle_single_number:
        result = ratio
    else:
        # Fallback: a string that is nothing but one int/float is used as-is
        # wherever the ratio itself came out null.
        lone_number_pattern = r"^\d+\.?\d*$"
        lone_number = (
            pl.when(column.str.contains(lone_number_pattern))
            .then(column.cast(pl.Float64, strict=False))
            .otherwise(None)
        )
        result = ratio.fill_null(lone_number)

    return pl.select(result.round(4)).to_series()
|
|
663
908
|
|
|
664
909
|
|
|
910
|
+
class TriRatioCalculator:
    """
    A transformer that handles three-part ("A:B:C") and two-part ("A:C")
    ratios, enforcing a strict output structure.

    - Three-part ratios produce A/B and A/C.
    - Two-part ratios are assumed to be A:C and produce None for A/B.
    - Single values produce None for both outputs.

    The parts are found by extracting every number (digits with an optional
    decimal part) from the string, so the separator characters themselves are
    not inspected.
    """
    def __init__(self, handle_zeros: bool = False):
        """
        Initializes the TriRatioCalculator.

        Args:
            handle_zeros (bool): If True, returns a valid value if either the denominator or numerator is zero; returns zero if both are zero.
        """
        self.handle_zeros = handle_zeros

    @staticmethod
    def _nth_number(numbers: pl.Expr, index: int) -> pl.Expr:
        """Return the `index`-th extracted number as Float64, or null if absent."""
        # slice + first returns null for short lists. A plain `.list.get(index)`
        # raises on an out-of-bounds index on newer Polars releases, and it is
        # evaluated for every row regardless of the surrounding `when`.
        return numbers.list.slice(index, 1).list.first().cast(pl.Float64, strict=False)

    def _calculate_ratio(self, num: pl.Expr, den: pl.Expr) -> pl.Expr:
        """Helper to contain the core division logic (result rounded to 4 decimals)."""
        if self.handle_zeros:
            # Special handling for zeros
            expr = (
                pl.when((num == 0) & (den == 0)).then(pl.lit(0.0))
                .when((num != 0) & (den == 0)).then(num)  # Return numerator
                .when((num == 0) & (den != 0)).then(den)  # Return denominator
                .otherwise(num / den)
            )
        else:
            # Default behavior: return null if denominator is 0
            expr = pl.when(den != 0).then(num / den).otherwise(None)

        return expr.round(4)

    def __call__(self, column: pl.Series) -> pl.DataFrame:
        """
        Applies the tri-ratio logic using Polars expressions.

        Args:
            column (pl.Series): The input Polars Series with string data.

        Returns:
            pl.DataFrame: Two columns, `A_div_B` and `A_div_C`.
        """
        # Wrap the input Series in a DataFrame so column expressions can be used
        temp_df = column.to_frame()

        # Every number found in the string, in order of appearance
        all_numbers_expr = pl.col(column.name).str.extract_all(r"(\d+\.?\d*)")
        num_parts_expr = all_numbers_expr.list.len()

        expr_A = self._nth_number(all_numbers_expr, 0)
        expr_B = self._nth_number(all_numbers_expr, 1)
        expr_C = self._nth_number(all_numbers_expr, 2)

        # A/B only exists for three-part ratios
        ratio_ab_expr = pl.when(num_parts_expr == 3).then(
            self._calculate_ratio(expr_A, expr_B)
        ).otherwise(None)

        # A/C exists for three-part ratios and for two-part ratios
        ratio_ac_expr = pl.when(num_parts_expr == 3).then(
            self._calculate_ratio(expr_A, expr_C)
        ).when(num_parts_expr == 2).then(
            self._calculate_ratio(expr_A, expr_B)  # B is actually C in this case
        ).otherwise(None)

        # Execute the expressions and return the final DataFrame
        return temp_df.select(
            A_div_B=ratio_ab_expr,
            A_div_C=ratio_ac_expr
        )
|
|
975
|
+
|
|
976
|
+
|
|
665
977
|
class CategoryMapper:
|
|
666
978
|
"""
|
|
667
979
|
A transformer that maps string categories to specified numerical values using a dictionary.
|
ml_tools/path_manager.py
CHANGED
|
@@ -20,7 +20,7 @@ class PathManager:
|
|
|
20
20
|
"""
|
|
21
21
|
Manages and stores a project's file paths, acting as a centralized
|
|
22
22
|
"path database". It supports both development mode and applications
|
|
23
|
-
bundled with Pyinstaller.
|
|
23
|
+
bundled with Pyinstaller or Nuitka.
|
|
24
24
|
|
|
25
25
|
Supports python dictionary syntax.
|
|
26
26
|
"""
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|