dragon-ml-toolbox 3.1.0__tar.gz → 3.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dragon-ml-toolbox might be problematic.

Files changed (30)
  1. {dragon_ml_toolbox-3.1.0/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-3.2.0}/PKG-INFO +1 -1
  2. {dragon_ml_toolbox-3.1.0 → dragon_ml_toolbox-3.2.0/dragon_ml_toolbox.egg-info}/PKG-INFO +1 -1
  3. {dragon_ml_toolbox-3.1.0 → dragon_ml_toolbox-3.2.0}/ml_tools/ETL_engineering.py +111 -21
  4. {dragon_ml_toolbox-3.1.0 → dragon_ml_toolbox-3.2.0}/pyproject.toml +1 -1
  5. {dragon_ml_toolbox-3.1.0 → dragon_ml_toolbox-3.2.0}/LICENSE +0 -0
  6. {dragon_ml_toolbox-3.1.0 → dragon_ml_toolbox-3.2.0}/LICENSE-THIRD-PARTY.md +0 -0
  7. {dragon_ml_toolbox-3.1.0 → dragon_ml_toolbox-3.2.0}/README.md +0 -0
  8. {dragon_ml_toolbox-3.1.0 → dragon_ml_toolbox-3.2.0}/dragon_ml_toolbox.egg-info/SOURCES.txt +0 -0
  9. {dragon_ml_toolbox-3.1.0 → dragon_ml_toolbox-3.2.0}/dragon_ml_toolbox.egg-info/dependency_links.txt +0 -0
  10. {dragon_ml_toolbox-3.1.0 → dragon_ml_toolbox-3.2.0}/dragon_ml_toolbox.egg-info/requires.txt +0 -0
  11. {dragon_ml_toolbox-3.1.0 → dragon_ml_toolbox-3.2.0}/dragon_ml_toolbox.egg-info/top_level.txt +0 -0
  12. {dragon_ml_toolbox-3.1.0 → dragon_ml_toolbox-3.2.0}/ml_tools/GUI_tools.py +0 -0
  13. {dragon_ml_toolbox-3.1.0 → dragon_ml_toolbox-3.2.0}/ml_tools/MICE_imputation.py +0 -0
  14. {dragon_ml_toolbox-3.1.0 → dragon_ml_toolbox-3.2.0}/ml_tools/ML_callbacks.py +0 -0
  15. {dragon_ml_toolbox-3.1.0 → dragon_ml_toolbox-3.2.0}/ml_tools/ML_evaluation.py +0 -0
  16. {dragon_ml_toolbox-3.1.0 → dragon_ml_toolbox-3.2.0}/ml_tools/ML_trainer.py +0 -0
  17. {dragon_ml_toolbox-3.1.0 → dragon_ml_toolbox-3.2.0}/ml_tools/ML_tutorial.py +0 -0
  18. {dragon_ml_toolbox-3.1.0 → dragon_ml_toolbox-3.2.0}/ml_tools/PSO_optimization.py +0 -0
  19. {dragon_ml_toolbox-3.1.0 → dragon_ml_toolbox-3.2.0}/ml_tools/RNN_forecast.py +0 -0
  20. {dragon_ml_toolbox-3.1.0 → dragon_ml_toolbox-3.2.0}/ml_tools/VIF_factor.py +0 -0
  21. {dragon_ml_toolbox-3.1.0 → dragon_ml_toolbox-3.2.0}/ml_tools/__init__.py +0 -0
  22. {dragon_ml_toolbox-3.1.0 → dragon_ml_toolbox-3.2.0}/ml_tools/_particle_swarm_optimization.py +0 -0
  23. {dragon_ml_toolbox-3.1.0 → dragon_ml_toolbox-3.2.0}/ml_tools/_pytorch_models.py +0 -0
  24. {dragon_ml_toolbox-3.1.0 → dragon_ml_toolbox-3.2.0}/ml_tools/data_exploration.py +0 -0
  25. {dragon_ml_toolbox-3.1.0 → dragon_ml_toolbox-3.2.0}/ml_tools/datasetmaster.py +0 -0
  26. {dragon_ml_toolbox-3.1.0 → dragon_ml_toolbox-3.2.0}/ml_tools/ensemble_learning.py +0 -0
  27. {dragon_ml_toolbox-3.1.0 → dragon_ml_toolbox-3.2.0}/ml_tools/handle_excel.py +0 -0
  28. {dragon_ml_toolbox-3.1.0 → dragon_ml_toolbox-3.2.0}/ml_tools/logger.py +0 -0
  29. {dragon_ml_toolbox-3.1.0 → dragon_ml_toolbox-3.2.0}/ml_tools/utilities.py +0 -0
  30. {dragon_ml_toolbox-3.1.0 → dragon_ml_toolbox-3.2.0}/setup.cfg +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 3.1.0
+Version: 3.2.0
 Summary: A collection of tools for data science and machine learning projects.
 Author-email: Karl Loza <luigiloza@gmail.com>
 License-Expression: MIT
dragon_ml_toolbox.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 3.1.0
+Version: 3.2.0
 Summary: A collection of tools for data science and machine learning projects.
 Author-email: Karl Loza <luigiloza@gmail.com>
 License-Expression: MIT
ml_tools/ETL_engineering.py
@@ -1,6 +1,6 @@
 import polars as pl
 import re
-from typing import Literal, Union, Optional, Any, Callable, List, Dict
+from typing import Literal, Union, Optional, Any, Callable, List, Dict, Tuple
 from .utilities import _script_info
 import pandas as pd
 from .logger import _LOGGER
@@ -11,6 +11,7 @@ __all__ = [
     "DataFrameCleaner",
     "TransformationRecipe",
     "DataProcessor",
+    "BinaryTransformer",
     "KeywordDummifier",
     "NumberExtractor",
     "MultiNumberExtractor",
@@ -144,7 +145,7 @@ class DataFrameCleaner:
         return df_cleaned


-############ TRANSFORM ####################
+############ TRANSFORM MAIN ####################

 # Magic word for rename-only transformation
 _RENAME = "rename"
@@ -329,6 +330,75 @@ class DataProcessor:
         """
         print(self)

+############ TRANSFORMERS ####################
+
+class BinaryTransformer:
+    """
+    A transformer that maps string values to a binary 1 or 0 based on keyword matching.
+
+    Must supply a list of keywords for either the 'true' case (1) or the 'false' case (0), but not both.
+
+    Args:
+        true_keywords (List[str] | None):
+            If a string contains any of these keywords, the output is 1, otherwise 0.
+        false_keywords (List[str] | None):
+            If a string contains any of these keywords, the output is 0, otherwise 1.
+    """
+    def __init__(
+        self,
+        true_keywords: Optional[List[str]] = None,
+        false_keywords: Optional[List[str]] = None,
+        case_insensitive: bool = True,
+    ):
+        # --- Validation: Enforce one and only one option ---
+        if true_keywords is not None and false_keywords is not None:
+            raise ValueError(
+                "Provide either 'true_keywords' or 'false_keywords', but not both."
+            )
+        if true_keywords is None and false_keywords is None:
+            raise ValueError(
+                "You must provide either 'true_keywords' or 'false_keywords'."
+            )
+
+        # --- Configuration ---
+        self.keywords: List[str] = true_keywords if true_keywords is not None else false_keywords  # type: ignore
+        if not self.keywords:
+            raise ValueError("Keyword list cannot be empty.")
+
+        self.mode: str = "true_mode" if true_keywords is not None else "false_mode"
+
+        # --- Create the regex string pattern ---
+        # Escape keywords to treat them as literals
+        base_pattern = "|".join(re.escape(k) for k in self.keywords)
+
+        # For polars, add case-insensitivity flag `(?i)` to the pattern string itself
+        if case_insensitive:
+            self.pattern = f"(?i){base_pattern}"
+        else:
+            self.pattern = base_pattern
+
+
+    def __call__(self, column: pl.Series) -> pl.Series:
+        """
+        Applies the binary mapping logic to the input column.
+
+        Args:
+            column (pl.Series): The input Polars Series of string data.
+
+        Returns:
+            pl.Series: A new Series of type UInt8 containing 1s and 0s.
+        """
+        # Create a boolean Series: True if any keyword is found, else False
+        contains_keyword = column.str.contains(self.pattern)
+
+        # Apply logic and cast directly to integer type
+        if self.mode == "true_mode":
+            # True -> 1, False -> 0
+            return contains_keyword.cast(pl.UInt8)
+        else:  # false_mode
+            # We want the inverse: True -> 0, False -> 1
+            return (~contains_keyword).cast(pl.UInt8)
+

 class KeywordDummifier:
     """
@@ -345,13 +415,16 @@ class KeywordDummifier:
         group_keywords (List[List[str]]):
             A list of lists of strings. Each inner list corresponds to a
             `group_name` at the same index and contains the keywords to search for.
+        case_insensitive (bool):
+            If True, keyword matching ignores case.
     """
-    def __init__(self, group_names: List[str], group_keywords: List[List[str]]):
+    def __init__(self, group_names: List[str], group_keywords: List[List[str]], case_insensitive: bool = True):
         if len(group_names) != len(group_keywords):
             raise ValueError("Initialization failed: 'group_names' and 'group_keywords' must have the same length.")

         self.group_names = group_names
         self.group_keywords = group_keywords
+        self.case_insensitive = case_insensitive

     def __call__(self, column: pl.Series) -> pl.DataFrame:
         """
@@ -365,9 +438,18 @@
         """
         column = column.cast(pl.Utf8)

-        categorize_expr = pl.when(pl.lit(False)).then(pl.lit(None))
+        categorize_expr = pl.when(pl.lit(False)).then(pl.lit(None, dtype=pl.Utf8))
+
         for name, keywords in zip(self.group_names, self.group_keywords):
-            pattern = "|".join(re.escape(k) for k in keywords)
+            # Create the base regex pattern by escaping and joining keywords
+            base_pattern = "|".join(re.escape(k) for k in keywords)
+
+            # Add the case-insensitive flag `(?i)` to the pattern string
+            if self.case_insensitive:
+                pattern = f"(?i){base_pattern}"
+            else:
+                pattern = base_pattern
+
             categorize_expr = categorize_expr.when(
                 column.str.contains(pattern)
             ).then(pl.lit(name))
@@ -386,6 +468,7 @@
                 df_with_dummies.get_column(dummy_col_name).alias(name)
             )
         else:
+            # If a group had no matches, create a column of zeros
             final_columns.append(pl.lit(0, dtype=pl.UInt8).alias(name))

         return pl.DataFrame(final_columns)
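
A corresponding sketch for the updated KeywordDummifier, showing the new case_insensitive default (again an illustration, not part of the diff; sample values are hypothetical):

import polars as pl
from ml_tools.ETL_engineering import KeywordDummifier

materials = pl.Series("material", ["Stainless Steel", "ALUMINIUM alloy", "copper wire"])

# One 0/1 column per group name; keyword matching now ignores case by default
dummify = KeywordDummifier(
    group_names=["steel", "aluminium"],
    group_keywords=[["steel"], ["aluminium"]],
)
print(dummify(materials))  # roughly: steel = [1, 0, 0], aluminium = [0, 1, 0]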
@@ -661,33 +744,42 @@ class RegexMapper:
     "first match wins" logic makes the order of the mapping important.

     Args:
-        mapping (Dict[str, Union[int, float]]):
+        mapping (Dict[str, [int | float]]):
             An ordered dictionary where keys are regex patterns and values are
             the numbers to map to if the pattern is found.
-        unseen_value (Optional[Union[int, float]], optional):
+        unseen_value (int | float | None):
             The numerical value to use for strings that do not match any
-            of the regex patterns. If None (default), unseen values are
-            mapped to null.
+            of the regex patterns. If None, unseen values are mapped to null.
+        case_insensitive (bool):
+            If True, the regex matching for all patterns will ignore case.
     """
     def __init__(
         self,
         mapping: Dict[str, Union[int, float]],
         unseen_value: Optional[Union[int, float]] = None,
+        case_insensitive: bool = True,
     ):
         # --- Validation ---
         if not isinstance(mapping, dict):
             raise TypeError("The 'mapping' argument must be a dictionary.")
+
+        self.unseen_value = unseen_value

+        # --- Process and validate patterns ---
+        # Process patterns here to be more efficient, avoiding reprocessing on every __call__.
+        self.processed_mapping: List[Tuple[str, Union[int, float]]] = []
         for pattern, value in mapping.items():
+            final_pattern = f"(?i){pattern}" if case_insensitive else pattern
+
+            # Validate the final pattern that will actually be used by Polars
             try:
-                re.compile(pattern)
+                re.compile(final_pattern)
             except re.error as e:
-                raise ValueError(f"Invalid regex pattern '{pattern}': {e}") from e
+                raise ValueError(f"Invalid regex pattern '{final_pattern}': {e}") from e
             if not isinstance(value, (int, float)):
                 raise TypeError(f"Mapping values must be int or float, but got {type(value)} for pattern '{pattern}'.")
-
-        self.mapping = mapping
-        self.unseen_value = unseen_value
+
+            self.processed_mapping.append((final_pattern, value))

     def __call__(self, column: pl.Series) -> pl.Series:
         """
@@ -700,22 +792,20 @@
         pl.Series: A new Series with strings mapped to numbers based on
             the first matching regex pattern.
         """
-        # Ensure the column is treated as a string for matching
-        str_column = column.cast(pl.Utf8)
+        # pl.String is the modern alias for pl.Utf8
+        str_column = column.cast(pl.String)

-        # Build the when/then/otherwise chain from the inside out.
-        # Start with the final fallback value for non-matches.
+        # Start with the fallback value for non-matches.
         mapping_expr = pl.lit(self.unseen_value)

-        # Iterate through the mapping in reverse to construct the nested expression
-        for pattern, value in reversed(list(self.mapping.items())):
+        # Iterate through the processed mapping in reverse to construct the nested expression
+        for pattern, value in reversed(self.processed_mapping):
             mapping_expr = (
                 pl.when(str_column.str.contains(pattern))
                 .then(pl.lit(value))
                 .otherwise(mapping_expr)
             )

-        # Execute the complete expression chain and return the resulting Series
         return pl.select(mapping_expr).to_series()

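
And a sketch of the reworked RegexMapper, which now validates and stores the (?i)-prefixed pattern strings at construction time and falls back to unseen_value for non-matches (illustrative only; import path and sample data are assumptions):

import polars as pl
from ml_tools.ETL_engineering import RegexMapper

grades = pl.Series("grade", ["Grade A", "grade b", "unknown"])

# First matching pattern wins; unmatched strings map to unseen_value (-1 here)
mapper = RegexMapper(mapping={"grade a": 1, "grade b": 2}, unseen_value=-1)
print(mapper(grades).to_list())  # expected: [1, 2, -1]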
 
pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "dragon-ml-toolbox"
-version = "3.1.0"
+version = "3.2.0"
 description = "A collection of tools for data science and machine learning projects."
 authors = [
     { name = "Karl Loza", email = "luigiloza@gmail.com" }