dragon-ml-toolbox 10.3.0__tar.gz → 10.4.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dragon-ml-toolbox might be problematic. Click here for more details.

Files changed (41)
  1. {dragon_ml_toolbox-10.3.0/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-10.4.1}/PKG-INFO +1 -1
  2. {dragon_ml_toolbox-10.3.0 → dragon_ml_toolbox-10.4.1/dragon_ml_toolbox.egg-info}/PKG-INFO +1 -1
  3. {dragon_ml_toolbox-10.3.0 → dragon_ml_toolbox-10.4.1}/ml_tools/ETL_engineering.py +199 -4
  4. {dragon_ml_toolbox-10.3.0 → dragon_ml_toolbox-10.4.1}/ml_tools/path_manager.py +1 -1
  5. {dragon_ml_toolbox-10.3.0 → dragon_ml_toolbox-10.4.1}/pyproject.toml +1 -1
  6. {dragon_ml_toolbox-10.3.0 → dragon_ml_toolbox-10.4.1}/LICENSE +0 -0
  7. {dragon_ml_toolbox-10.3.0 → dragon_ml_toolbox-10.4.1}/LICENSE-THIRD-PARTY.md +0 -0
  8. {dragon_ml_toolbox-10.3.0 → dragon_ml_toolbox-10.4.1}/README.md +0 -0
  9. {dragon_ml_toolbox-10.3.0 → dragon_ml_toolbox-10.4.1}/dragon_ml_toolbox.egg-info/SOURCES.txt +0 -0
  10. {dragon_ml_toolbox-10.3.0 → dragon_ml_toolbox-10.4.1}/dragon_ml_toolbox.egg-info/dependency_links.txt +0 -0
  11. {dragon_ml_toolbox-10.3.0 → dragon_ml_toolbox-10.4.1}/dragon_ml_toolbox.egg-info/requires.txt +0 -0
  12. {dragon_ml_toolbox-10.3.0 → dragon_ml_toolbox-10.4.1}/dragon_ml_toolbox.egg-info/top_level.txt +0 -0
  13. {dragon_ml_toolbox-10.3.0 → dragon_ml_toolbox-10.4.1}/ml_tools/ETL_cleaning.py +0 -0
  14. {dragon_ml_toolbox-10.3.0 → dragon_ml_toolbox-10.4.1}/ml_tools/GUI_tools.py +0 -0
  15. {dragon_ml_toolbox-10.3.0 → dragon_ml_toolbox-10.4.1}/ml_tools/MICE_imputation.py +0 -0
  16. {dragon_ml_toolbox-10.3.0 → dragon_ml_toolbox-10.4.1}/ml_tools/ML_callbacks.py +0 -0
  17. {dragon_ml_toolbox-10.3.0 → dragon_ml_toolbox-10.4.1}/ml_tools/ML_datasetmaster.py +0 -0
  18. {dragon_ml_toolbox-10.3.0 → dragon_ml_toolbox-10.4.1}/ml_tools/ML_evaluation.py +0 -0
  19. {dragon_ml_toolbox-10.3.0 → dragon_ml_toolbox-10.4.1}/ml_tools/ML_evaluation_multi.py +0 -0
  20. {dragon_ml_toolbox-10.3.0 → dragon_ml_toolbox-10.4.1}/ml_tools/ML_inference.py +0 -0
  21. {dragon_ml_toolbox-10.3.0 → dragon_ml_toolbox-10.4.1}/ml_tools/ML_models.py +0 -0
  22. {dragon_ml_toolbox-10.3.0 → dragon_ml_toolbox-10.4.1}/ml_tools/ML_optimization.py +0 -0
  23. {dragon_ml_toolbox-10.3.0 → dragon_ml_toolbox-10.4.1}/ml_tools/ML_scaler.py +0 -0
  24. {dragon_ml_toolbox-10.3.0 → dragon_ml_toolbox-10.4.1}/ml_tools/ML_trainer.py +0 -0
  25. {dragon_ml_toolbox-10.3.0 → dragon_ml_toolbox-10.4.1}/ml_tools/PSO_optimization.py +0 -0
  26. {dragon_ml_toolbox-10.3.0 → dragon_ml_toolbox-10.4.1}/ml_tools/RNN_forecast.py +0 -0
  27. {dragon_ml_toolbox-10.3.0 → dragon_ml_toolbox-10.4.1}/ml_tools/SQL.py +0 -0
  28. {dragon_ml_toolbox-10.3.0 → dragon_ml_toolbox-10.4.1}/ml_tools/VIF_factor.py +0 -0
  29. {dragon_ml_toolbox-10.3.0 → dragon_ml_toolbox-10.4.1}/ml_tools/__init__.py +0 -0
  30. {dragon_ml_toolbox-10.3.0 → dragon_ml_toolbox-10.4.1}/ml_tools/_logger.py +0 -0
  31. {dragon_ml_toolbox-10.3.0 → dragon_ml_toolbox-10.4.1}/ml_tools/_script_info.py +0 -0
  32. {dragon_ml_toolbox-10.3.0 → dragon_ml_toolbox-10.4.1}/ml_tools/custom_logger.py +0 -0
  33. {dragon_ml_toolbox-10.3.0 → dragon_ml_toolbox-10.4.1}/ml_tools/data_exploration.py +0 -0
  34. {dragon_ml_toolbox-10.3.0 → dragon_ml_toolbox-10.4.1}/ml_tools/ensemble_evaluation.py +0 -0
  35. {dragon_ml_toolbox-10.3.0 → dragon_ml_toolbox-10.4.1}/ml_tools/ensemble_inference.py +0 -0
  36. {dragon_ml_toolbox-10.3.0 → dragon_ml_toolbox-10.4.1}/ml_tools/ensemble_learning.py +0 -0
  37. {dragon_ml_toolbox-10.3.0 → dragon_ml_toolbox-10.4.1}/ml_tools/handle_excel.py +0 -0
  38. {dragon_ml_toolbox-10.3.0 → dragon_ml_toolbox-10.4.1}/ml_tools/keys.py +0 -0
  39. {dragon_ml_toolbox-10.3.0 → dragon_ml_toolbox-10.4.1}/ml_tools/optimization_tools.py +0 -0
  40. {dragon_ml_toolbox-10.3.0 → dragon_ml_toolbox-10.4.1}/ml_tools/utilities.py +0 -0
  41. {dragon_ml_toolbox-10.3.0 → dragon_ml_toolbox-10.4.1}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dragon-ml-toolbox
3
- Version: 10.3.0
3
+ Version: 10.4.1
4
4
  Summary: A collection of tools for data science and machine learning projects.
5
5
  Author-email: Karl Loza <luigiloza@gmail.com>
6
6
  License-Expression: MIT
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dragon-ml-toolbox
3
- Version: 10.3.0
3
+ Version: 10.4.1
4
4
  Summary: A collection of tools for data science and machine learning projects.
5
5
  Author-email: Karl Loza <luigiloza@gmail.com>
6
6
  License-Expression: MIT
@@ -17,7 +17,10 @@ __all__ = [
17
17
  "KeywordDummifier",
18
18
  "NumberExtractor",
19
19
  "MultiNumberExtractor",
20
+ "TemperatureExtractor",
21
+ "MultiTemperatureExtractor",
20
22
  "RatioCalculator",
23
+ "TriRatioCalculator",
21
24
  "CategoryMapper",
22
25
  "RegexMapper",
23
26
  "ValueBinner",
@@ -324,6 +327,15 @@ class AutoDummifier:
324
327
  A transformer that performs one-hot encoding on a categorical column,
325
328
  automatically detecting the unique categories from the data.
326
329
  """
330
+ def __init__(self, drop_first: bool = False):
331
+ """
332
+ Initializes the AutoDummifier.
333
+
334
+ Args:
335
+ drop_first (bool): If True, drops the first dummy column.
336
+ """
337
+ self.drop_first = drop_first
338
+
327
339
  def __call__(self, column: pl.Series) -> pl.DataFrame:
328
340
  """
329
341
  Executes the one-hot encoding logic.
@@ -337,7 +349,7 @@ class AutoDummifier:
337
349
  '{original_col_name}_{category_value}'.
338
350
  """
339
351
  # Ensure the column is treated as a string before creating dummies
340
- return column.cast(pl.Utf8).to_dummies()
352
+ return column.cast(pl.Utf8).to_dummies(drop_first=self.drop_first)
341
353
 
342
354
 
343
355
  class MultiBinaryDummifier:
@@ -639,6 +651,189 @@ class MultiNumberExtractor:
639
651
  return pl.select(output_expressions)
640
652
 
641
653
 
654
+ class TemperatureExtractor:
655
+ """
656
+ Extracts temperature values from a string column.
657
+
658
+ This transformer assumes that the source temperature values are in Celsius.
659
+ It can extract a single value using a specific regex or find all numbers in
660
+ a string and calculate their average. It also supports converting the final
661
+ Celsius value to Kelvin or Rankine.
662
+
663
+ Args:
664
+ regex_pattern (str):
665
+ The regex to find a single temperature. MUST contain exactly one
666
+ capturing group `(...)`. This is ignored if `average_mode` is True.
667
+ average_mode (bool):
668
+ If True, extracts all numbers from the string and returns their average.
669
+ This overrides the `regex_pattern` with a generic number-finding regex.
670
+ convert (str | None):
671
+ If "K", converts the final Celsius value to Kelvin.
672
+ If "R", converts the final Celsius value to Rankine.
673
+ If None (default), the value remains in Celsius.
674
+ """
675
+ def __init__(
676
+ self,
677
+ regex_pattern: str = r"(\d+\.?\d*)",
678
+ average_mode: bool = False,
679
+ convert: Optional[Literal["K", "R"]] = None,
680
+ ):
681
+ # --- Store configuration ---
682
+ self.average_mode = average_mode
683
+ self.convert = convert
684
+ self.regex_pattern = regex_pattern
685
+
686
+ # Generic pattern for average mode, defined once for efficiency.
687
+ self._avg_mode_pattern = r"(\d+\.?\d*)"
688
+
689
+ # --- Validation ---
690
+ if not self.average_mode:
691
+ try:
692
+ if re.compile(self.regex_pattern).groups != 1:
693
+ _LOGGER.error("'regex_pattern' must contain exactly one capturing group '(...)' for single extraction mode.")
694
+ raise ValueError()
695
+ except re.error as e:
696
+ _LOGGER.error(f"Invalid regex pattern provided: {e}")
697
+ raise ValueError()
698
+
699
+ if self.convert is not None and self.convert not in ["K", "R"]:
700
+ _LOGGER.error("'convert' must be either 'K' (Kelvin) or 'R' (Rankine).")
701
+ raise ValueError()
702
+
703
+ def __call__(self, column: pl.Series) -> pl.Series:
704
+ """
705
+ Applies the temperature extraction and conversion logic.
706
+
707
+ Args:
708
+ column (pl.Series): The input Polars Series with string data.
709
+
710
+ Returns:
711
+ pl.Series: A new Series containing the final temperature values as floats.
712
+ """
713
+ # --- Step 1: Extract number(s) to get a Celsius value expression ---
714
+ if self.average_mode:
715
+ # Extract all numbers and compute their mean. Polars' list.mean()
716
+ # handles the casting to float automatically.
717
+ celsius_expr = column.str.extract_all(self._avg_mode_pattern).list.mean()
718
+ else:
719
+ # Extract a single number using the specified pattern.
720
+ # Cast to Float64, with non-matches becoming null.
721
+ celsius_expr = column.str.extract(self.regex_pattern, 1).cast(pl.Float64, strict=False)
722
+
723
+ # --- Step 2: Apply conversion if specified ---
724
+ if self.convert == "K":
725
+ # Celsius to Kelvin: C + 273.15
726
+ final_expr = celsius_expr + 273.15
727
+ elif self.convert == "R":
728
+ # Celsius to Rankine: (C * 9/5) + 491.67
729
+ final_expr = (celsius_expr * 1.8) + 491.67
730
+ else:
731
+ # No conversion needed
732
+ final_expr = celsius_expr
733
+
734
+ # --- Step 3: Round the result and return as a Series ---
735
+ # The select().to_series() pattern is a robust way to execute an
736
+ # expression and guarantee a Series is returned.
737
+ return pl.select(final_expr.round(2)).to_series()
738
+
739
+
740
+ class MultiTemperatureExtractor:
741
+ """
742
+ Extracts multiple temperature values from a single string column into
743
+ several new columns, assuming the source values are in Celsius.
744
+
745
+ This one-to-many transformer is designed for cases where multiple readings
746
+ are packed into one field, like "Min: 10C, Max: 25C".
747
+
748
+ Args:
749
+ num_outputs (int):
750
+ The number of numeric columns to create.
751
+ regex_pattern (str):
752
+ The regex to find all numbers. Must contain exactly one capturing
753
+ group around the number part (e.g., r"(-?\\d+\\.?\\d*)").
754
+ convert (str | None):
755
+ If "K", converts the final Celsius values to Kelvin.
756
+ If "R", converts the final Celsius values to Rankine.
757
+ If None (default), the values remain in Celsius.
758
+ fill_value (int | float | None):
759
+ A value to use if a temperature is not found at a given position.
760
+ For example, if `num_outputs=3` and only two temperatures are
761
+ found, the third column will be filled with this value. If None,
762
+ it will be filled with null.
763
+ """
764
+ def __init__(
765
+ self,
766
+ num_outputs: int,
767
+ regex_pattern: str = r"(\d+\.?\d*)",
768
+ convert: Optional[Literal["K", "R"]] = None,
769
+ fill_value: Optional[Union[int, float]] = None
770
+ ):
771
+ # --- Validation ---
772
+ if not isinstance(num_outputs, int) or num_outputs <= 0:
773
+ _LOGGER.error("'num_outputs' must be a positive integer.")
774
+ raise ValueError()
775
+
776
+ try:
777
+ if re.compile(regex_pattern).groups != 1:
778
+ _LOGGER.error("'regex_pattern' must contain exactly one capturing group '(...)'.")
779
+ raise ValueError()
780
+ except re.error as e:
781
+ _LOGGER.error(f"Invalid regex pattern provided: {e}")
782
+ raise ValueError()
783
+
784
+ if convert is not None and convert not in ["K", "R"]:
785
+ _LOGGER.error("'convert' must be either 'K' (Kelvin) or 'R' (Rankine).")
786
+ raise ValueError()
787
+
788
+ # --- Store configuration ---
789
+ self.num_outputs = num_outputs
790
+ self.regex_pattern = regex_pattern
791
+ self.convert = convert
792
+ self.fill_value = fill_value
793
+
794
+ def __call__(self, column: pl.Series) -> pl.DataFrame:
795
+ """
796
+ Applies the multi-temperature extraction and conversion logic.
797
+ """
798
+ output_expressions = []
799
+ for i in range(self.num_outputs):
800
+ # --- Step 1: Extract the i-th number as a Celsius value ---
801
+ celsius_expr = (
802
+ column.str.extract_all(self.regex_pattern)
803
+ .list.get(i, null_on_oob=True)
804
+ .cast(pl.Float64, strict=False)
805
+ )
806
+
807
+ # --- Step 2: Apply conversion if specified ---
808
+ if self.convert == "K":
809
+ # Celsius to Kelvin: C + 273.15
810
+ converted_expr = celsius_expr + 273.15
811
+ elif self.convert == "R":
812
+ # Celsius to Rankine: (C * 9/5) + 491.67
813
+ converted_expr = (celsius_expr * 1.8) + 491.67
814
+ else:
815
+ # No conversion needed
816
+ converted_expr = celsius_expr
817
+
818
+ # --- Step 3: Apply fill value and handle original nulls ---
819
+ final_expr = converted_expr.round(2)
820
+ if self.fill_value is not None:
821
+ final_expr = final_expr.fill_null(self.fill_value)
822
+
823
+ # Ensure that if the original row was null, all outputs are null
824
+ final_expr = (
825
+ pl.when(column.is_not_null())
826
+ .then(final_expr)
827
+ .otherwise(None)
828
+ .alias(f"col_{i}") # Temporary name for DataProcessor
829
+ )
830
+
831
+ output_expressions.append(final_expr)
832
+
833
+ # Execute all expressions at once for performance
834
+ return pl.select(output_expressions)
835
+
836
+
642
837
  class RatioCalculator:
643
838
  """
644
839
  A transformer that parses a string ratio (e.g., "40:5" or "30/2") and
@@ -757,9 +952,9 @@ class TriRatioCalculator:
757
952
  all_numbers_expr = pl.col(column.name).str.extract_all(r"(\d+\.?\d*)")
758
953
  num_parts_expr = all_numbers_expr.list.len()
759
954
 
760
- expr_A = all_numbers_expr.list.get(0).cast(pl.Float64)
761
- expr_B = all_numbers_expr.list.get(1).cast(pl.Float64)
762
- expr_C = all_numbers_expr.list.get(2).cast(pl.Float64)
955
+ expr_A = all_numbers_expr.list.get(0, null_on_oob=True).cast(pl.Float64)
956
+ expr_B = all_numbers_expr.list.get(1, null_on_oob=True).cast(pl.Float64)
957
+ expr_C = all_numbers_expr.list.get(2, null_on_oob=True).cast(pl.Float64)
763
958
 
764
959
  # Define logic for each output column using expressions
765
960
  ratio_ab_expr = pl.when(num_parts_expr == 3).then(
@@ -20,7 +20,7 @@ class PathManager:
20
20
  """
21
21
  Manages and stores a project's file paths, acting as a centralized
22
22
  "path database". It supports both development mode and applications
23
- bundled with Pyinstaller.
23
+ bundled with Pyinstaller or Nuitka.
24
24
 
25
25
  Supports python dictionary syntax.
26
26
  """
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "dragon-ml-toolbox"
3
- version = "10.3.0"
3
+ version = "10.4.1"
4
4
  description = "A collection of tools for data science and machine learning projects."
5
5
  authors = [
6
6
  { name = "Karl Loza", email = "luigiloza@gmail.com" }