dragon-ml-toolbox 3.0.0__tar.gz → 3.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dragon-ml-toolbox might be problematic. Click here for more details.

Files changed (30) hide show
  1. {dragon_ml_toolbox-3.0.0/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-3.2.0}/PKG-INFO +1 -1
  2. {dragon_ml_toolbox-3.0.0 → dragon_ml_toolbox-3.2.0/dragon_ml_toolbox.egg-info}/PKG-INFO +1 -1
  3. {dragon_ml_toolbox-3.0.0 → dragon_ml_toolbox-3.2.0}/ml_tools/ETL_engineering.py +152 -33
  4. {dragon_ml_toolbox-3.0.0 → dragon_ml_toolbox-3.2.0}/ml_tools/data_exploration.py +2 -2
  5. {dragon_ml_toolbox-3.0.0 → dragon_ml_toolbox-3.2.0}/pyproject.toml +1 -1
  6. {dragon_ml_toolbox-3.0.0 → dragon_ml_toolbox-3.2.0}/LICENSE +0 -0
  7. {dragon_ml_toolbox-3.0.0 → dragon_ml_toolbox-3.2.0}/LICENSE-THIRD-PARTY.md +0 -0
  8. {dragon_ml_toolbox-3.0.0 → dragon_ml_toolbox-3.2.0}/README.md +0 -0
  9. {dragon_ml_toolbox-3.0.0 → dragon_ml_toolbox-3.2.0}/dragon_ml_toolbox.egg-info/SOURCES.txt +0 -0
  10. {dragon_ml_toolbox-3.0.0 → dragon_ml_toolbox-3.2.0}/dragon_ml_toolbox.egg-info/dependency_links.txt +0 -0
  11. {dragon_ml_toolbox-3.0.0 → dragon_ml_toolbox-3.2.0}/dragon_ml_toolbox.egg-info/requires.txt +0 -0
  12. {dragon_ml_toolbox-3.0.0 → dragon_ml_toolbox-3.2.0}/dragon_ml_toolbox.egg-info/top_level.txt +0 -0
  13. {dragon_ml_toolbox-3.0.0 → dragon_ml_toolbox-3.2.0}/ml_tools/GUI_tools.py +0 -0
  14. {dragon_ml_toolbox-3.0.0 → dragon_ml_toolbox-3.2.0}/ml_tools/MICE_imputation.py +0 -0
  15. {dragon_ml_toolbox-3.0.0 → dragon_ml_toolbox-3.2.0}/ml_tools/ML_callbacks.py +0 -0
  16. {dragon_ml_toolbox-3.0.0 → dragon_ml_toolbox-3.2.0}/ml_tools/ML_evaluation.py +0 -0
  17. {dragon_ml_toolbox-3.0.0 → dragon_ml_toolbox-3.2.0}/ml_tools/ML_trainer.py +0 -0
  18. {dragon_ml_toolbox-3.0.0 → dragon_ml_toolbox-3.2.0}/ml_tools/ML_tutorial.py +0 -0
  19. {dragon_ml_toolbox-3.0.0 → dragon_ml_toolbox-3.2.0}/ml_tools/PSO_optimization.py +0 -0
  20. {dragon_ml_toolbox-3.0.0 → dragon_ml_toolbox-3.2.0}/ml_tools/RNN_forecast.py +0 -0
  21. {dragon_ml_toolbox-3.0.0 → dragon_ml_toolbox-3.2.0}/ml_tools/VIF_factor.py +0 -0
  22. {dragon_ml_toolbox-3.0.0 → dragon_ml_toolbox-3.2.0}/ml_tools/__init__.py +0 -0
  23. {dragon_ml_toolbox-3.0.0 → dragon_ml_toolbox-3.2.0}/ml_tools/_particle_swarm_optimization.py +0 -0
  24. {dragon_ml_toolbox-3.0.0 → dragon_ml_toolbox-3.2.0}/ml_tools/_pytorch_models.py +0 -0
  25. {dragon_ml_toolbox-3.0.0 → dragon_ml_toolbox-3.2.0}/ml_tools/datasetmaster.py +0 -0
  26. {dragon_ml_toolbox-3.0.0 → dragon_ml_toolbox-3.2.0}/ml_tools/ensemble_learning.py +0 -0
  27. {dragon_ml_toolbox-3.0.0 → dragon_ml_toolbox-3.2.0}/ml_tools/handle_excel.py +0 -0
  28. {dragon_ml_toolbox-3.0.0 → dragon_ml_toolbox-3.2.0}/ml_tools/logger.py +0 -0
  29. {dragon_ml_toolbox-3.0.0 → dragon_ml_toolbox-3.2.0}/ml_tools/utilities.py +0 -0
  30. {dragon_ml_toolbox-3.0.0 → dragon_ml_toolbox-3.2.0}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dragon-ml-toolbox
3
- Version: 3.0.0
3
+ Version: 3.2.0
4
4
  Summary: A collection of tools for data science and machine learning projects.
5
5
  Author-email: Karl Loza <luigiloza@gmail.com>
6
6
  License-Expression: MIT
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dragon-ml-toolbox
3
- Version: 3.0.0
3
+ Version: 3.2.0
4
4
  Summary: A collection of tools for data science and machine learning projects.
5
5
  Author-email: Karl Loza <luigiloza@gmail.com>
6
6
  License-Expression: MIT
@@ -1,6 +1,6 @@
1
1
  import polars as pl
2
2
  import re
3
- from typing import Literal, Union, Optional, Any, Callable, List, Dict
3
+ from typing import Literal, Union, Optional, Any, Callable, List, Dict, Tuple
4
4
  from .utilities import _script_info
5
5
  import pandas as pd
6
6
  from .logger import _LOGGER
@@ -11,6 +11,7 @@ __all__ = [
11
11
  "DataFrameCleaner",
12
12
  "TransformationRecipe",
13
13
  "DataProcessor",
14
+ "BinaryTransformer",
14
15
  "KeywordDummifier",
15
16
  "NumberExtractor",
16
17
  "MultiNumberExtractor",
@@ -25,18 +26,26 @@ __all__ = [
25
26
 
26
27
  class ColumnCleaner:
27
28
  """
28
- Cleans and standardizes a single pandas Series based on a dictionary of regex-to-value replacement rules.
29
+ Cleans and standardizes a pandas Series by applying regex-to-replacement rules.
30
+ Supports sub-string replacements and case-insensitivity.
31
+
32
+ Notes:
33
+ - Write separate, specific rules for each case. Don't combine patterns with an "OR".
34
+ - Define rules from most specific to more general to create a fallback system.
35
+ - Beware of chain replacements (rules matching strings that have already been changed by a previous rule).
29
36
 
30
37
  Args:
31
38
  rules (Dict[str, str]):
32
- A dictionary where each key is a regular expression pattern and
33
- each value is the standardized string to replace matches with.
39
+ A dictionary of regex patterns to replacement strings. Can use
40
+ backreferences in the replacement statement (e.g., r'\\1 \\2 \\3 \\4 \\5') for captured groups.
41
+ case_insensitive (bool):
42
+ If True, regex matching ignores case.
34
43
  """
35
- def __init__(self, rules: Dict[str, str]):
44
+ def __init__(self, rules: Dict[str, str], case_insensitive: bool = True):
36
45
  if not isinstance(rules, dict):
37
46
  raise TypeError("The 'rules' argument must be a dictionary.")
38
47
 
39
- # Validate that all keys are valid regular expressions
48
+ # Validate regex patterns
40
49
  for pattern in rules.keys():
41
50
  try:
42
51
  re.compile(pattern)
@@ -44,32 +53,52 @@ class ColumnCleaner:
44
53
  raise ValueError(f"Invalid regex pattern '{pattern}': {e}") from e
45
54
 
46
55
  self.rules = rules
56
+ self.case_insensitive = case_insensitive
47
57
 
48
58
  def clean(self, series: pd.Series) -> pd.Series:
49
59
  """
50
- Applies the standardization rules to the provided Series (requires string data).
60
+ Applies the standardization rules sequentially to the provided Series.
51
61
 
52
- Non-matching values are kept as they are.
53
-
54
62
  Args:
55
63
  series (pd.Series): The pandas Series to clean.
56
64
 
57
65
  Returns:
58
- pd.Series: A new Series with the values cleaned and standardized.
66
+ pd.Series: A new Series with the regex replacements applied.
59
67
  """
60
- return series.astype(str).replace(self.rules, regex=True)
68
+ cleaned_series = series.astype(str)
69
+
70
+ # Set the regex flags based on the case_insensitive setting
71
+ flags = re.IGNORECASE if self.case_insensitive else 0
72
+
73
+ # Sequentially apply each regex rule
74
+ for pattern, replacement in self.rules.items():
75
+ cleaned_series = cleaned_series.str.replace(
76
+ pattern,
77
+ replacement,
78
+ regex=True,
79
+ flags=flags
80
+ )
81
+
82
+ return cleaned_series
61
83
 
62
84
 
63
85
  class DataFrameCleaner:
64
86
  """
65
87
  Orchestrates the cleaning of multiple columns in a pandas DataFrame using a nested dictionary of rules and `ColumnCleaner` objects.
88
+
89
+ Chosen case-sensitivity is applied to all columns.
90
+
91
+ Notes:
92
+ - Write separate, specific rules for each case. Don't combine patterns with an "OR".
93
+ - Define rules from most specific to more general to create a fallback system.
94
+ - Beware of chain replacements (rules matching strings that have already been changed by a previous rule).
66
95
 
67
96
  Args:
68
97
  rules (Dict[str, Dict[str, str]]):
69
98
  A nested dictionary where each top-level key is a column name,
70
99
  and its value is a dictionary of regex rules for that column, as expected by `ColumnCleaner`.
71
100
  """
72
- def __init__(self, rules: Dict[str, Dict[str, str]]):
101
+ def __init__(self, rules: Dict[str, Dict[str, str]], case_insensitive: bool = True):
73
102
  if not isinstance(rules, dict):
74
103
  raise TypeError("The 'rules' argument must be a nested dictionary.")
75
104
 
@@ -81,6 +110,7 @@ class DataFrameCleaner:
81
110
  )
82
111
 
83
112
  self.rules = rules
113
+ self.case_insensitive = case_insensitive
84
114
 
85
115
  def clean(self, df: pd.DataFrame) -> pd.DataFrame:
86
116
  """
@@ -109,13 +139,13 @@ class DataFrameCleaner:
109
139
 
110
140
  for column_name, column_rules in self.rules.items():
111
141
  # Create and apply the specific cleaner for the column
112
- cleaner = ColumnCleaner(rules=column_rules)
142
+ cleaner = ColumnCleaner(rules=column_rules, case_insensitive=self.case_insensitive)
113
143
  df_cleaned[column_name] = cleaner.clean(df_cleaned[column_name])
114
144
 
115
145
  return df_cleaned
116
146
 
117
147
 
118
- ############ TRANSFORM ####################
148
+ ############ TRANSFORM MAIN ####################
119
149
 
120
150
  # Magic word for rename-only transformation
121
151
  _RENAME = "rename"
@@ -300,6 +330,75 @@ class DataProcessor:
300
330
  """
301
331
  print(self)
302
332
 
333
+ ############ TRANSFORMERS ####################
334
+
335
+ class BinaryTransformer:
336
+ """
337
+ A transformer that maps string values to a binary 1 or 0 based on keyword matching.
338
+
339
+ Must supply a list of keywords for either the 'true' case (1) or the 'false' case (0), but not both.
340
+
341
+ Args:
342
+ true_keywords (List[str] | None):
343
+ If a string contains any of these keywords, the output is 1, otherwise 0.
344
+ false_keywords (List[str] | None):
345
+ If a string contains any of these keywords, the output is 0, otherwise 1.
346
+ """
347
+ def __init__(
348
+ self,
349
+ true_keywords: Optional[List[str]] = None,
350
+ false_keywords: Optional[List[str]] = None,
351
+ case_insensitive: bool = True,
352
+ ):
353
+ # --- Validation: Enforce one and only one option ---
354
+ if true_keywords is not None and false_keywords is not None:
355
+ raise ValueError(
356
+ "Provide either 'true_keywords' or 'false_keywords', but not both."
357
+ )
358
+ if true_keywords is None and false_keywords is None:
359
+ raise ValueError(
360
+ "You must provide either 'true_keywords' or 'false_keywords'."
361
+ )
362
+
363
+ # --- Configuration ---
364
+ self.keywords: List[str] = true_keywords if true_keywords is not None else false_keywords # type: ignore
365
+ if not self.keywords:
366
+ raise ValueError("Keyword list cannot be empty.")
367
+
368
+ self.mode: str = "true_mode" if true_keywords is not None else "false_mode"
369
+
370
+ # --- Create the regex string pattern ---
371
+ # Escape keywords to treat them as literals
372
+ base_pattern = "|".join(re.escape(k) for k in self.keywords)
373
+
374
+ # For polars, add case-insensitivity flag `(?i)` to the pattern string itself
375
+ if case_insensitive:
376
+ self.pattern = f"(?i){base_pattern}"
377
+ else:
378
+ self.pattern = base_pattern
379
+
380
+
381
+ def __call__(self, column: pl.Series) -> pl.Series:
382
+ """
383
+ Applies the binary mapping logic to the input column.
384
+
385
+ Args:
386
+ column (pl.Series): The input Polars Series of string data.
387
+
388
+ Returns:
389
+ pl.Series: A new Series of type UInt8 containing 1s and 0s.
390
+ """
391
+ # Create a boolean Series: True if any keyword is found, else False
392
+ contains_keyword = column.str.contains(self.pattern)
393
+
394
+ # Apply logic and cast directly to integer type
395
+ if self.mode == "true_mode":
396
+ # True -> 1, False -> 0
397
+ return contains_keyword.cast(pl.UInt8)
398
+ else: # false_mode
399
+ # We want the inverse: True -> 0, False -> 1
400
+ return (~contains_keyword).cast(pl.UInt8)
401
+
303
402
 
304
403
  class KeywordDummifier:
305
404
  """
@@ -316,13 +415,16 @@ class KeywordDummifier:
316
415
  group_keywords (List[List[str]]):
317
416
  A list of lists of strings. Each inner list corresponds to a
318
417
  `group_name` at the same index and contains the keywords to search for.
418
+ case_insensitive (bool):
419
+ If True, keyword matching ignores case.
319
420
  """
320
- def __init__(self, group_names: List[str], group_keywords: List[List[str]]):
421
+ def __init__(self, group_names: List[str], group_keywords: List[List[str]], case_insensitive: bool = True):
321
422
  if len(group_names) != len(group_keywords):
322
423
  raise ValueError("Initialization failed: 'group_names' and 'group_keywords' must have the same length.")
323
424
 
324
425
  self.group_names = group_names
325
426
  self.group_keywords = group_keywords
427
+ self.case_insensitive = case_insensitive
326
428
 
327
429
  def __call__(self, column: pl.Series) -> pl.DataFrame:
328
430
  """
@@ -336,9 +438,18 @@ class KeywordDummifier:
336
438
  """
337
439
  column = column.cast(pl.Utf8)
338
440
 
339
- categorize_expr = pl.when(pl.lit(False)).then(pl.lit(None))
441
+ categorize_expr = pl.when(pl.lit(False)).then(pl.lit(None, dtype=pl.Utf8))
442
+
340
443
  for name, keywords in zip(self.group_names, self.group_keywords):
341
- pattern = "|".join(re.escape(k) for k in keywords)
444
+ # Create the base regex pattern by escaping and joining keywords
445
+ base_pattern = "|".join(re.escape(k) for k in keywords)
446
+
447
+ # Add the case-insensitive flag `(?i)` to the pattern string
448
+ if self.case_insensitive:
449
+ pattern = f"(?i){base_pattern}"
450
+ else:
451
+ pattern = base_pattern
452
+
342
453
  categorize_expr = categorize_expr.when(
343
454
  column.str.contains(pattern)
344
455
  ).then(pl.lit(name))
@@ -357,6 +468,7 @@ class KeywordDummifier:
357
468
  df_with_dummies.get_column(dummy_col_name).alias(name)
358
469
  )
359
470
  else:
471
+ # If a group had no matches, create a column of zeros
360
472
  final_columns.append(pl.lit(0, dtype=pl.UInt8).alias(name))
361
473
 
362
474
  return pl.DataFrame(final_columns)
@@ -632,33 +744,42 @@ class RegexMapper:
632
744
  "first match wins" logic makes the order of the mapping important.
633
745
 
634
746
  Args:
635
- mapping (Dict[str, Union[int, float]]):
747
+ mapping (Dict[str, int | float]):
636
748
  An ordered dictionary where keys are regex patterns and values are
637
749
  the numbers to map to if the pattern is found.
638
- unseen_value (Optional[Union[int, float]], optional):
750
+ unseen_value (int | float | None):
639
751
  The numerical value to use for strings that do not match any
640
- of the regex patterns. If None (default), unseen values are
641
- mapped to null.
752
+ of the regex patterns. If None, unseen values are mapped to null.
753
+ case_insensitive (bool):
754
+ If True, the regex matching for all patterns will ignore case.
642
755
  """
643
756
  def __init__(
644
757
  self,
645
758
  mapping: Dict[str, Union[int, float]],
646
759
  unseen_value: Optional[Union[int, float]] = None,
760
+ case_insensitive: bool = True,
647
761
  ):
648
762
  # --- Validation ---
649
763
  if not isinstance(mapping, dict):
650
764
  raise TypeError("The 'mapping' argument must be a dictionary.")
765
+
766
+ self.unseen_value = unseen_value
651
767
 
768
+ # --- Process and validate patterns ---
769
+ # Process patterns here to be more efficient, avoiding reprocessing on every __call__.
770
+ self.processed_mapping: List[Tuple[str, Union[int, float]]] = []
652
771
  for pattern, value in mapping.items():
772
+ final_pattern = f"(?i){pattern}" if case_insensitive else pattern
773
+
774
+ # Validate the final pattern that will actually be used by Polars
653
775
  try:
654
- re.compile(pattern)
776
+ re.compile(final_pattern)
655
777
  except re.error as e:
656
- raise ValueError(f"Invalid regex pattern '{pattern}': {e}") from e
778
+ raise ValueError(f"Invalid regex pattern '{final_pattern}': {e}") from e
657
779
  if not isinstance(value, (int, float)):
658
780
  raise TypeError(f"Mapping values must be int or float, but got {type(value)} for pattern '{pattern}'.")
659
-
660
- self.mapping = mapping
661
- self.unseen_value = unseen_value
781
+
782
+ self.processed_mapping.append((final_pattern, value))
662
783
 
663
784
  def __call__(self, column: pl.Series) -> pl.Series:
664
785
  """
@@ -671,22 +792,20 @@ class RegexMapper:
671
792
  pl.Series: A new Series with strings mapped to numbers based on
672
793
  the first matching regex pattern.
673
794
  """
674
- # Ensure the column is treated as a string for matching
675
- str_column = column.cast(pl.Utf8)
795
+ # pl.String is the modern alias for pl.Utf8
796
+ str_column = column.cast(pl.String)
676
797
 
677
- # Build the when/then/otherwise chain from the inside out.
678
- # Start with the final fallback value for non-matches.
798
+ # Start with the fallback value for non-matches.
679
799
  mapping_expr = pl.lit(self.unseen_value)
680
800
 
681
- # Iterate through the mapping in reverse to construct the nested expression
682
- for pattern, value in reversed(list(self.mapping.items())):
801
+ # Iterate through the processed mapping in reverse to construct the nested expression
802
+ for pattern, value in reversed(self.processed_mapping):
683
803
  mapping_expr = (
684
804
  pl.when(str_column.str.contains(pattern))
685
805
  .then(pl.lit(value))
686
806
  .otherwise(mapping_expr)
687
807
  )
688
808
 
689
- # Execute the complete expression chain and return the resulting Series
690
809
  return pl.select(mapping_expr).to_series()
691
810
 
692
811
 
@@ -587,14 +587,14 @@ def standardize_percentages(
587
587
  Standardizes numeric columns containing mixed-format percentages.
588
588
 
589
589
  This function cleans columns where percentages might be entered as whole
590
- numbers (e.g., 55) or as proportions (e.g., 0.55). It assumes values
590
+ numbers (55) and as proportions (0.55). It assumes values
591
591
  between 0 and 1 are proportions and multiplies them by 100.
592
592
 
593
593
  Args:
594
594
  df (pd.DataFrame): The input pandas DataFrame.
595
595
  columns (list[str]): A list of column names to standardize.
596
596
  treat_one_as_proportion (bool):
597
- - If True (default): The value `1` is treated as a proportion and converted to `100`.
597
+ - If True (default): The value `1` is treated as a proportion and converted to `100%`.
598
598
  - If False: The value `1` is treated as `1%`.
599
599
  round_digits (int): The number of decimal places to round the final result to.
600
600
 
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "dragon-ml-toolbox"
3
- version = "3.0.0"
3
+ version = "3.2.0"
4
4
  description = "A collection of tools for data science and machine learning projects."
5
5
  authors = [
6
6
  { name = "Karl Loza", email = "luigiloza@gmail.com" }