dragon-ml-toolbox 10.2.1__py3-none-any.whl → 10.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dragon-ml-toolbox might be problematic. Click here for more details.
- {dragon_ml_toolbox-10.2.1.dist-info → dragon_ml_toolbox-10.3.0.dist-info}/METADATA +1 -1
- {dragon_ml_toolbox-10.2.1.dist-info → dragon_ml_toolbox-10.3.0.dist-info}/RECORD +7 -7
- ml_tools/ETL_engineering.py +130 -13
- {dragon_ml_toolbox-10.2.1.dist-info → dragon_ml_toolbox-10.3.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-10.2.1.dist-info → dragon_ml_toolbox-10.3.0.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-10.2.1.dist-info → dragon_ml_toolbox-10.3.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-10.2.1.dist-info → dragon_ml_toolbox-10.3.0.dist-info}/top_level.txt +0 -0
|
@@ -1,7 +1,7 @@
|
|
|
1
|
-
dragon_ml_toolbox-10.2.1.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
|
|
2
|
-
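dragon_ml_toolbox-10.2.1.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=lY4_rJPnLnMu7YBQaY-_iz1JRDcLdQzNCyeLAF1glJY,1837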
|
|
1
|
+
dragon_ml_toolbox-10.3.0.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
|
|
2
|
+
dragon_ml_toolbox-10.3.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=lY4_rJPnLnMu7YBQaY-_iz1JRDcLdQzNCyeLAF1glJY,1837
|
|
3
3
|
ml_tools/ETL_cleaning.py,sha256=lSP5q6-ukGhJBPV8dlsqJvPXAzj4du_0J-SbtEd0Pjg,19292
|
|
4
|
-
ml_tools/ETL_engineering.py,sha256=
|
|
4
|
+
ml_tools/ETL_engineering.py,sha256=SllDugNGhvJWR4OBKnklxYNcYDvf3nR7U8Dkrf9bXbc,41271
|
|
5
5
|
ml_tools/GUI_tools.py,sha256=kEQWg-bog3pB5tI22gMGKWaCGHnz9TB2Lvvfhf5F2CI,45412
|
|
6
6
|
ml_tools/MICE_imputation.py,sha256=kVSythWfxJFR4-2mtcYCWQaQ1Oz5yyx_SJu5gjnS7H8,11670
|
|
7
7
|
ml_tools/ML_callbacks.py,sha256=JPvEw_cW5tYNJ2rMSgnNrKLuni_UrmuhDFaOw-u2SvA,13926
|
|
@@ -30,7 +30,7 @@ ml_tools/keys.py,sha256=HtPG8-MWh89C32A7eIlfuuA-DLwkxGkoDfwR2TGN9CQ,1074
|
|
|
30
30
|
ml_tools/optimization_tools.py,sha256=P3I6lIpvZ8Xf2kX5FvvBKBmrK2pB6idBpkTzfUJxTeE,5073
|
|
31
31
|
ml_tools/path_manager.py,sha256=TJgoqMAryc5F0dal8W_zvJgE1TpOzlskIyYJk614WW4,13809
|
|
32
32
|
ml_tools/utilities.py,sha256=SVMaSDigh6SUoAeig2_sXLLIj5w5mUs5KuVWpHvFDec,19816
|
|
33
|
-
dragon_ml_toolbox-10.2.1.dist-info/METADATA,sha256=
|
|
34
|
-
dragon_ml_toolbox-10.2.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
35
|
-
dragon_ml_toolbox-10.2.1.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
|
|
36
|
-
dragon_ml_toolbox-10.2.1.dist-info/RECORD,,
|
|
33
|
+
dragon_ml_toolbox-10.3.0.dist-info/METADATA,sha256=cN1FrJIq3Iul289UqHNOnhVrn1QT0V62rnQJFz5V374,6968
|
|
34
|
+
dragon_ml_toolbox-10.3.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
35
|
+
dragon_ml_toolbox-10.3.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
|
|
36
|
+
dragon_ml_toolbox-10.3.0.dist-info/RECORD,,
|
ml_tools/ETL_engineering.py
CHANGED
|
@@ -1,6 +1,9 @@
|
|
|
1
1
|
import polars as pl
|
|
2
2
|
import re
|
|
3
|
+
from pathlib import Path
|
|
3
4
|
from typing import Literal, Union, Optional, Any, Callable, List, Dict, Tuple
|
|
5
|
+
from .utilities import load_dataframe, save_dataframe
|
|
6
|
+
from .path_manager import make_fullpath
|
|
4
7
|
from ._script_info import _script_info
|
|
5
8
|
from ._logger import _LOGGER
|
|
6
9
|
|
|
@@ -182,9 +185,28 @@ class DataProcessor:
|
|
|
182
185
|
if not processed_columns:
|
|
183
186
|
_LOGGER.error("The transformation resulted in an empty DataFrame.")
|
|
184
187
|
return pl.DataFrame()
|
|
185
|
-
|
|
188
|
+
|
|
189
|
+
_LOGGER.info(f"Processed dataframe with {len(processed_columns)} columns.")
|
|
190
|
+
|
|
186
191
|
return pl.DataFrame(processed_columns)
|
|
187
192
|
|
|
193
|
+
def load_transform_save(self, input_path: Union[str,Path], output_path: Union[str,Path]):
|
|
194
|
+
"""
|
|
195
|
+
Convenience wrapper for the transform method that includes automatic dataframe loading and saving.
|
|
196
|
+
"""
|
|
197
|
+
# Validate paths
|
|
198
|
+
in_path = make_fullpath(input_path, enforce="file")
|
|
199
|
+
out_path = make_fullpath(output_path, make=True, enforce="file")
|
|
200
|
+
|
|
201
|
+
# load df
|
|
202
|
+
df, _ = load_dataframe(df_path=in_path, kind="polars", all_strings=True)
|
|
203
|
+
|
|
204
|
+
# Process
|
|
205
|
+
df_processed = self.transform(df)
|
|
206
|
+
|
|
207
|
+
# save processed df
|
|
208
|
+
save_dataframe(df=df_processed, save_dir=out_path.parent, filename=out_path.name)
|
|
209
|
+
|
|
188
210
|
def __str__(self) -> str:
|
|
189
211
|
"""
|
|
190
212
|
Provides a detailed, human-readable string representation of the
|
|
@@ -620,13 +642,14 @@ class MultiNumberExtractor:
|
|
|
620
642
|
class RatioCalculator:
|
|
621
643
|
"""
|
|
622
644
|
A transformer that parses a string ratio (e.g., "40:5" or "30/2") and
|
|
623
|
-
computes the result of the division.
|
|
624
|
-
|
|
645
|
+
computes the result of the division. Includes robust handling for
|
|
646
|
+
zeros and single numbers.
|
|
625
647
|
"""
|
|
626
648
|
def __init__(
|
|
627
649
|
self,
|
|
628
|
-
|
|
629
|
-
|
|
650
|
+
regex_pattern: str = r"(\d+\.?\d*)\s*[::/]\s*(\d+\.?\d*)",
|
|
651
|
+
handle_zeros: bool = False,
|
|
652
|
+
handle_single_number: bool = False
|
|
630
653
|
):
|
|
631
654
|
# --- Robust Validation ---
|
|
632
655
|
try:
|
|
@@ -642,26 +665,120 @@ class RatioCalculator:
|
|
|
642
665
|
raise ValueError()
|
|
643
666
|
|
|
644
667
|
self.regex_pattern = regex_pattern
|
|
668
|
+
self.handle_zeros = handle_zeros
|
|
669
|
+
self.handle_single_number = handle_single_number
|
|
645
670
|
|
|
646
671
|
def __call__(self, column: pl.Series) -> pl.Series:
|
|
647
672
|
"""
|
|
648
|
-
Applies the ratio calculation logic to the input column.
|
|
649
|
-
This version uses .str.extract() for maximum stability.
|
|
673
|
+
Applies the ratio calculation logic to the input column. Uses .str.extract() for maximum stability and includes optional handling for zeros and single numbers.
|
|
650
674
|
"""
|
|
651
675
|
# Extract numerator (group 1) and denominator (group 2) separately.
|
|
652
676
|
numerator_expr = column.str.extract(self.regex_pattern, 1).cast(pl.Float64, strict=False)
|
|
653
677
|
denominator_expr = column.str.extract(self.regex_pattern, 2).cast(pl.Float64, strict=False)
|
|
654
678
|
|
|
655
|
-
#
|
|
656
|
-
|
|
657
|
-
|
|
658
|
-
|
|
659
|
-
|
|
660
|
-
|
|
679
|
+
# --- Logic for Requirement A: Special zero handling ---
|
|
680
|
+
if self.handle_zeros:
|
|
681
|
+
ratio_expr = (
|
|
682
|
+
pl.when(numerator_expr.is_not_null() & denominator_expr.is_not_null())
|
|
683
|
+
.then(
|
|
684
|
+
pl.when((numerator_expr == 0) & (denominator_expr == 0)).then(pl.lit(0.0))
|
|
685
|
+
.when((numerator_expr != 0) & (denominator_expr == 0)).then(numerator_expr)
|
|
686
|
+
.when((numerator_expr == 0) & (denominator_expr != 0)).then(denominator_expr)
|
|
687
|
+
.otherwise(numerator_expr / denominator_expr) # Default: both are non-zero
|
|
688
|
+
)
|
|
689
|
+
)
|
|
690
|
+
else:
|
|
691
|
+
# Original logic
|
|
692
|
+
ratio_expr = pl.when(denominator_expr != 0).then(
|
|
693
|
+
numerator_expr / denominator_expr
|
|
694
|
+
).otherwise(
|
|
695
|
+
None # Handles null denominators and division by zero
|
|
696
|
+
)
|
|
697
|
+
|
|
698
|
+
# --- Logic for Requirement B: Handle single numbers as a fallback ---
|
|
699
|
+
if self.handle_single_number:
|
|
700
|
+
# Regex to match a string that is ONLY a valid float/int
|
|
701
|
+
single_number_regex = r"^\d+\.?\d*$"
|
|
702
|
+
single_number_expr = (
|
|
703
|
+
pl.when(column.str.contains(single_number_regex))
|
|
704
|
+
.then(column.cast(pl.Float64, strict=False))
|
|
705
|
+
.otherwise(None)
|
|
706
|
+
)
|
|
707
|
+
# If ratio_expr is null, try to fill it with single_number_expr
|
|
708
|
+
final_expr = ratio_expr.fill_null(single_number_expr)
|
|
709
|
+
else:
|
|
710
|
+
final_expr = ratio_expr
|
|
661
711
|
|
|
662
712
|
return pl.select(final_expr.round(4)).to_series()
|
|
663
713
|
|
|
664
714
|
|
|
715
|
+
class TriRatioCalculator:
|
|
716
|
+
"""
|
|
717
|
+
A transformer that handles three-part ("A:B:C") and two-part ("A:C")
|
|
718
|
+
ratios, enforcing a strict output structure.
|
|
719
|
+
|
|
720
|
+
- Three-part ratios produce A/B and A/C.
|
|
721
|
+
- Two-part ratios are assumed to be A:C and produce None for A/B.
|
|
722
|
+
- Single values produce None for both outputs.
|
|
723
|
+
"""
|
|
724
|
+
def __init__(self, handle_zeros: bool = False):
|
|
725
|
+
"""
|
|
726
|
+
Initializes the TriRatioCalculator.
|
|
727
|
+
|
|
728
|
+
Args:
|
|
729
|
+
handle_zeros (bool): If True, returns a valid value if either the denominator or numerator is zero; returns zero if both are zero.
|
|
730
|
+
"""
|
|
731
|
+
self.handle_zeros = handle_zeros
|
|
732
|
+
|
|
733
|
+
def _calculate_ratio(self, num: pl.Expr, den: pl.Expr) -> pl.Expr:
|
|
734
|
+
"""Helper to contain the core division logic."""
|
|
735
|
+
if self.handle_zeros:
|
|
736
|
+
# Special handling for zeros
|
|
737
|
+
expr = (
|
|
738
|
+
pl.when((num == 0) & (den == 0)).then(pl.lit(0.0))
|
|
739
|
+
.when((num != 0) & (den == 0)).then(num) # Return numerator
|
|
740
|
+
.when((num == 0) & (den != 0)).then(den) # Return denominator
|
|
741
|
+
.otherwise(num / den)
|
|
742
|
+
)
|
|
743
|
+
else:
|
|
744
|
+
# Default behavior: return null if denominator is 0
|
|
745
|
+
expr = pl.when(den != 0).then(num / den).otherwise(None)
|
|
746
|
+
|
|
747
|
+
return expr.round(4)
|
|
748
|
+
|
|
749
|
+
def __call__(self, column: pl.Series) -> pl.DataFrame:
|
|
750
|
+
"""
|
|
751
|
+
Applies the robust tri-ratio logic using the lazy API.
|
|
752
|
+
"""
|
|
753
|
+
# Wrap the input Series in a DataFrame to use the lazy expression API
|
|
754
|
+
temp_df = column.to_frame()
|
|
755
|
+
|
|
756
|
+
# Define all steps as lazy expressions
|
|
757
|
+
all_numbers_expr = pl.col(column.name).str.extract_all(r"(\d+\.?\d*)")
|
|
758
|
+
num_parts_expr = all_numbers_expr.list.len()
|
|
759
|
+
|
|
760
|
+
expr_A = all_numbers_expr.list.get(0).cast(pl.Float64)
|
|
761
|
+
expr_B = all_numbers_expr.list.get(1).cast(pl.Float64)
|
|
762
|
+
expr_C = all_numbers_expr.list.get(2).cast(pl.Float64)
|
|
763
|
+
|
|
764
|
+
# Define logic for each output column using expressions
|
|
765
|
+
ratio_ab_expr = pl.when(num_parts_expr == 3).then(
|
|
766
|
+
self._calculate_ratio(expr_A, expr_B)
|
|
767
|
+
).otherwise(None)
|
|
768
|
+
|
|
769
|
+
ratio_ac_expr = pl.when(num_parts_expr == 3).then(
|
|
770
|
+
self._calculate_ratio(expr_A, expr_C)
|
|
771
|
+
).when(num_parts_expr == 2).then(
|
|
772
|
+
self._calculate_ratio(expr_A, expr_B) # B is actually C in this case
|
|
773
|
+
).otherwise(None)
|
|
774
|
+
|
|
775
|
+
# Execute the expressions and return the final DataFrame
|
|
776
|
+
return temp_df.select(
|
|
777
|
+
A_div_B=ratio_ab_expr,
|
|
778
|
+
A_div_C=ratio_ac_expr
|
|
779
|
+
)
|
|
780
|
+
|
|
781
|
+
|
|
665
782
|
class CategoryMapper:
|
|
666
783
|
"""
|
|
667
784
|
A transformer that maps string categories to specified numerical values using a dictionary.
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|