dragon-ml-toolbox 3.1.0__py3-none-any.whl → 3.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This is a potentially problematic release.


This version of dragon-ml-toolbox might be problematic. Click here for more details.

@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dragon-ml-toolbox
3
- Version: 3.1.0
3
+ Version: 3.2.1
4
4
  Summary: A collection of tools for data science and machine learning projects.
5
5
  Author-email: Karl Loza <luigiloza@gmail.com>
6
6
  License-Expression: MIT
@@ -1,6 +1,6 @@
1
- dragon_ml_toolbox-3.1.0.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
2
- dragon_ml_toolbox-3.1.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=6cfpIeQ6D4Mcs10nkogQrkVyq1T7i2qXjjNHFoUMOyE,1892
3
- ml_tools/ETL_engineering.py,sha256=qxLbw8Vc0lOHUJm5ou280Tvw3oh_G1UHonxfa7nu_4Q,33008
1
+ dragon_ml_toolbox-3.2.1.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
2
+ dragon_ml_toolbox-3.2.1.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=6cfpIeQ6D4Mcs10nkogQrkVyq1T7i2qXjjNHFoUMOyE,1892
3
+ ml_tools/ETL_engineering.py,sha256=Y1EW_OeUiJju-BA31tnmetV_09hDLvgrE2Rk31TnDOU,37270
4
4
  ml_tools/GUI_tools.py,sha256=uFx6zIrQZzDPSTtOSHz8ptz-fxZiQz-lXHcrqwuYV_E,20385
5
5
  ml_tools/MICE_imputation.py,sha256=ed-YeQkEAeHxTNkWIHs09T4YeYNF0aqAnrUTcdIEp9E,11372
6
6
  ml_tools/ML_callbacks.py,sha256=gHZk-lyzAax6iEtG26zHuoobdAZCFJ6BmI6pWoXkOrw,13189
@@ -19,7 +19,7 @@ ml_tools/ensemble_learning.py,sha256=PPtBBLgLvaYOdY-MlcjXuxWWXf3JQavLNEysFgzjc_s
19
19
  ml_tools/handle_excel.py,sha256=lwds7rDLlGSCWiWGI7xNg-Z7kxAepogp0lstSFa0590,12949
20
20
  ml_tools/logger.py,sha256=jC4Q2OqmDm8ZO9VpuZqBSWdXryqaJvLscqVJ6caNMOk,6009
21
21
  ml_tools/utilities.py,sha256=opNR-ACH6BnLkWAKcb19ef5tFxfx22TI6E2o0RYwiGA,21021
22
- dragon_ml_toolbox-3.1.0.dist-info/METADATA,sha256=yGk-slwRhPF23NxfVG0vR0NeIQbo_mJ-_ZEIomBLvrQ,3273
23
- dragon_ml_toolbox-3.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
24
- dragon_ml_toolbox-3.1.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
25
- dragon_ml_toolbox-3.1.0.dist-info/RECORD,,
22
+ dragon_ml_toolbox-3.2.1.dist-info/METADATA,sha256=qPGzA1lRzS1MC5RGxI1uuUQyGyOC1r6tw87xkwe2Cyw,3273
23
+ dragon_ml_toolbox-3.2.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
24
+ dragon_ml_toolbox-3.2.1.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
25
+ dragon_ml_toolbox-3.2.1.dist-info/RECORD,,
@@ -1,6 +1,6 @@
1
1
  import polars as pl
2
2
  import re
3
- from typing import Literal, Union, Optional, Any, Callable, List, Dict
3
+ from typing import Literal, Union, Optional, Any, Callable, List, Dict, Tuple
4
4
  from .utilities import _script_info
5
5
  import pandas as pd
6
6
  from .logger import _LOGGER
@@ -11,6 +11,7 @@ __all__ = [
11
11
  "DataFrameCleaner",
12
12
  "TransformationRecipe",
13
13
  "DataProcessor",
14
+ "BinaryTransformer",
14
15
  "KeywordDummifier",
15
16
  "NumberExtractor",
16
17
  "MultiNumberExtractor",
@@ -144,7 +145,7 @@ class DataFrameCleaner:
144
145
  return df_cleaned
145
146
 
146
147
 
147
- ############ TRANSFORM ####################
148
+ ############ TRANSFORM MAIN ####################
148
149
 
149
150
  # Magic word for rename-only transformation
150
151
  _RENAME = "rename"
@@ -329,6 +330,75 @@ class DataProcessor:
329
330
  """
330
331
  print(self)
331
332
 
333
+ ############ TRANSFORMERS ####################
334
+
335
+ class BinaryTransformer:
336
+ """
337
+ A transformer that maps string values to a binary 1 or 0 based on keyword matching.
338
+
339
+ Must supply a list of keywords for either the 'true' case (1) or the 'false' case (0), but not both.
340
+
341
+ Args:
342
+ true_keywords (List[str] | None):
343
+ If a string contains any of these keywords, the output is 1, otherwise 0.
344
+ false_keywords (List[str] | None):
345
+ If a string contains any of these keywords, the output is 0, otherwise 1.
346
+ """
347
+ def __init__(
348
+ self,
349
+ true_keywords: Optional[List[str]] = None,
350
+ false_keywords: Optional[List[str]] = None,
351
+ case_insensitive: bool = True,
352
+ ):
353
+ # --- Validation: Enforce one and only one option ---
354
+ if true_keywords is not None and false_keywords is not None:
355
+ raise ValueError(
356
+ "Provide either 'true_keywords' or 'false_keywords', but not both."
357
+ )
358
+ if true_keywords is None and false_keywords is None:
359
+ raise ValueError(
360
+ "You must provide either 'true_keywords' or 'false_keywords'."
361
+ )
362
+
363
+ # --- Configuration ---
364
+ self.keywords: List[str] = true_keywords if true_keywords is not None else false_keywords # type: ignore
365
+ if not self.keywords:
366
+ raise ValueError("Keyword list cannot be empty.")
367
+
368
+ self.mode: str = "true_mode" if true_keywords is not None else "false_mode"
369
+
370
+ # --- Create the regex string pattern ---
371
+ # Escape keywords to treat them as literals
372
+ base_pattern = "|".join(re.escape(k) for k in self.keywords)
373
+
374
+ # For polars, add case-insensitivity flag `(?i)` to the pattern string itself
375
+ if case_insensitive:
376
+ self.pattern = f"(?i){base_pattern}"
377
+ else:
378
+ self.pattern = base_pattern
379
+
380
+
381
+ def __call__(self, column: pl.Series) -> pl.Series:
382
+ """
383
+ Applies the binary mapping logic to the input column.
384
+
385
+ Args:
386
+ column (pl.Series): The input Polars Series of string data.
387
+
388
+ Returns:
389
+ pl.Series: A new Series of type UInt8 containing 1s and 0s.
390
+ """
391
+ # Create a boolean Series: True if any keyword is found, else False
392
+ contains_keyword = column.str.contains(self.pattern)
393
+
394
+ # Apply logic and cast directly to integer type
395
+ if self.mode == "true_mode":
396
+ # True -> 1, False -> 0
397
+ return contains_keyword.cast(pl.UInt8)
398
+ else: # false_mode
399
+ # We want the inverse: True -> 0, False -> 1
400
+ return (~contains_keyword).cast(pl.UInt8)
401
+
332
402
 
333
403
  class KeywordDummifier:
334
404
  """
@@ -345,13 +415,19 @@ class KeywordDummifier:
345
415
  group_keywords (List[List[str]]):
346
416
  A list of lists of strings. Each inner list corresponds to a
347
417
  `group_name` at the same index and contains the keywords to search for.
418
+ case_insensitive (bool):
419
+ If True, keyword matching ignores case.
420
+ drop_empty (bool):
421
+ If True, columns that contain no positive matches (all zeros) will be dropped from the final output.
348
422
  """
349
- def __init__(self, group_names: List[str], group_keywords: List[List[str]]):
423
+ def __init__(self, group_names: List[str], group_keywords: List[List[str]], case_insensitive: bool = True, drop_empty: bool = True):
350
424
  if len(group_names) != len(group_keywords):
351
425
  raise ValueError("Initialization failed: 'group_names' and 'group_keywords' must have the same length.")
352
426
 
353
427
  self.group_names = group_names
354
428
  self.group_keywords = group_keywords
429
+ self.case_insensitive = case_insensitive
430
+ self.drop_empty = drop_empty
355
431
 
356
432
  def __call__(self, column: pl.Series) -> pl.DataFrame:
357
433
  """
@@ -365,9 +441,18 @@ class KeywordDummifier:
365
441
  """
366
442
  column = column.cast(pl.Utf8)
367
443
 
368
- categorize_expr = pl.when(pl.lit(False)).then(pl.lit(None))
444
+ categorize_expr = pl.when(pl.lit(False)).then(pl.lit(None, dtype=pl.Utf8))
445
+
369
446
  for name, keywords in zip(self.group_names, self.group_keywords):
370
- pattern = "|".join(re.escape(k) for k in keywords)
447
+ # Create the base regex pattern by escaping and joining keywords
448
+ base_pattern = "|".join(re.escape(k) for k in keywords)
449
+
450
+ # Add the case-insensitive flag `(?i)` to the pattern string
451
+ if self.case_insensitive:
452
+ pattern = f"(?i){base_pattern}"
453
+ else:
454
+ pattern = base_pattern
455
+
371
456
  categorize_expr = categorize_expr.when(
372
457
  column.str.contains(pattern)
373
458
  ).then(pl.lit(name))
@@ -386,9 +471,19 @@ class KeywordDummifier:
386
471
  df_with_dummies.get_column(dummy_col_name).alias(name)
387
472
  )
388
473
  else:
474
+ # If a group had no matches, create a column of zeros
389
475
  final_columns.append(pl.lit(0, dtype=pl.UInt8).alias(name))
390
476
 
391
- return pl.DataFrame(final_columns)
477
+ # First, create a full DataFrame with all potential columns
478
+ result_df = pl.DataFrame(final_columns)
479
+
480
+ # If drop_empty is True, filter out all-zero columns
481
+ if self.drop_empty:
482
+ # A column is kept if its sum is greater than 0
483
+ cols_to_keep = [col for col in result_df if col.sum() > 0]
484
+ return result_df.select(cols_to_keep)
485
+
486
+ return result_df
392
487
 
393
488
 
394
489
  class NumberExtractor:
@@ -661,33 +756,42 @@ class RegexMapper:
661
756
  "first match wins" logic makes the order of the mapping important.
662
757
 
663
758
  Args:
664
- mapping (Dict[str, Union[int, float]]):
759
+ mapping (Dict[str, [int | float]]):
665
760
  An ordered dictionary where keys are regex patterns and values are
666
761
  the numbers to map to if the pattern is found.
667
- unseen_value (Optional[Union[int, float]], optional):
762
+ unseen_value (int | float | None):
668
763
  The numerical value to use for strings that do not match any
669
- of the regex patterns. If None (default), unseen values are
670
- mapped to null.
764
+ of the regex patterns. If None, unseen values are mapped to null.
765
+ case_insensitive (bool):
766
+ If True , the regex matching for all patterns will ignore case.
671
767
  """
672
768
  def __init__(
673
769
  self,
674
770
  mapping: Dict[str, Union[int, float]],
675
771
  unseen_value: Optional[Union[int, float]] = None,
772
+ case_insensitive: bool = True,
676
773
  ):
677
774
  # --- Validation ---
678
775
  if not isinstance(mapping, dict):
679
776
  raise TypeError("The 'mapping' argument must be a dictionary.")
777
+
778
+ self.unseen_value = unseen_value
680
779
 
780
+ # --- Process and validate patterns ---
781
+ # Process patterns here to be more efficient, avoiding reprocessing on every __call__.
782
+ self.processed_mapping: List[Tuple[str, Union[int, float]]] = []
681
783
  for pattern, value in mapping.items():
784
+ final_pattern = f"(?i){pattern}" if case_insensitive else pattern
785
+
786
+ # Validate the final pattern that will actually be used by Polars
682
787
  try:
683
- re.compile(pattern)
788
+ re.compile(final_pattern)
684
789
  except re.error as e:
685
- raise ValueError(f"Invalid regex pattern '{pattern}': {e}") from e
790
+ raise ValueError(f"Invalid regex pattern '{final_pattern}': {e}") from e
686
791
  if not isinstance(value, (int, float)):
687
792
  raise TypeError(f"Mapping values must be int or float, but got {type(value)} for pattern '{pattern}'.")
688
-
689
- self.mapping = mapping
690
- self.unseen_value = unseen_value
793
+
794
+ self.processed_mapping.append((final_pattern, value))
691
795
 
692
796
  def __call__(self, column: pl.Series) -> pl.Series:
693
797
  """
@@ -700,22 +804,20 @@ class RegexMapper:
700
804
  pl.Series: A new Series with strings mapped to numbers based on
701
805
  the first matching regex pattern.
702
806
  """
703
- # Ensure the column is treated as a string for matching
704
- str_column = column.cast(pl.Utf8)
807
+ # pl.String is the modern alias for pl.Utf8
808
+ str_column = column.cast(pl.String)
705
809
 
706
- # Build the when/then/otherwise chain from the inside out.
707
- # Start with the final fallback value for non-matches.
810
+ # Start with the fallback value for non-matches.
708
811
  mapping_expr = pl.lit(self.unseen_value)
709
812
 
710
- # Iterate through the mapping in reverse to construct the nested expression
711
- for pattern, value in reversed(list(self.mapping.items())):
813
+ # Iterate through the processed mapping in reverse to construct the nested expression
814
+ for pattern, value in reversed(self.processed_mapping):
712
815
  mapping_expr = (
713
816
  pl.when(str_column.str.contains(pattern))
714
817
  .then(pl.lit(value))
715
818
  .otherwise(mapping_expr)
716
819
  )
717
820
 
718
- # Execute the complete expression chain and return the resulting Series
719
821
  return pl.select(mapping_expr).to_series()
720
822
 
721
823