dragon-ml-toolbox 3.2.0__py3-none-any.whl → 3.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dragon-ml-toolbox might be problematic; review the release details below before use.

@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dragon-ml-toolbox
3
- Version: 3.2.0
3
+ Version: 3.3.0
4
4
  Summary: A collection of tools for data science and machine learning projects.
5
5
  Author-email: Karl Loza <luigiloza@gmail.com>
6
6
  License-Expression: MIT
@@ -1,6 +1,6 @@
1
- dragon_ml_toolbox-3.2.0.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
2
- dragon_ml_toolbox-3.2.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=6cfpIeQ6D4Mcs10nkogQrkVyq1T7i2qXjjNHFoUMOyE,1892
3
- ml_tools/ETL_engineering.py,sha256=xbGti0XS84l1UWQ6y3mP-VV_4DfaRBoz8-B3bTgiCUE,36690
1
+ dragon_ml_toolbox-3.3.0.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
2
+ dragon_ml_toolbox-3.3.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=6cfpIeQ6D4Mcs10nkogQrkVyq1T7i2qXjjNHFoUMOyE,1892
3
+ ml_tools/ETL_engineering.py,sha256=TL5sz7ac4uwa993Uy6Ex11-9vN_t5IaD-Y9UjyI88_o,39030
4
4
  ml_tools/GUI_tools.py,sha256=uFx6zIrQZzDPSTtOSHz8ptz-fxZiQz-lXHcrqwuYV_E,20385
5
5
  ml_tools/MICE_imputation.py,sha256=ed-YeQkEAeHxTNkWIHs09T4YeYNF0aqAnrUTcdIEp9E,11372
6
6
  ml_tools/ML_callbacks.py,sha256=gHZk-lyzAax6iEtG26zHuoobdAZCFJ6BmI6pWoXkOrw,13189
@@ -13,13 +13,13 @@ ml_tools/VIF_factor.py,sha256=5GVAldH69Vkei3WRUZN1uPBMzGoOOeEOA-bgmZXbbUw,10301
13
13
  ml_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
14
14
  ml_tools/_particle_swarm_optimization.py,sha256=b_eNNkA89Y40hj76KauivT8KLScH1B9wF2IXptOqkOw,22220
15
15
  ml_tools/_pytorch_models.py,sha256=bpWZsrSwCvHJQkR6UfoPpElsMv9AvmiNErNHC8NYB_I,10132
16
- ml_tools/data_exploration.py,sha256=bOcCoQLeDPFJ7nB5Fsi16lzB22cG-c-mxObMsTetgS4,23655
16
+ ml_tools/data_exploration.py,sha256=IkpOyIRPKdu4qKeUdvvMvNPelSVWegNEKMqX3IInmpw,25003
17
17
  ml_tools/datasetmaster.py,sha256=N-uwfzWnl_qnoAqjbfS98I1pVNra5u6rhKLdWbFIReA,30122
18
18
  ml_tools/ensemble_learning.py,sha256=PPtBBLgLvaYOdY-MlcjXuxWWXf3JQavLNEysFgzjc_s,37470
19
19
  ml_tools/handle_excel.py,sha256=lwds7rDLlGSCWiWGI7xNg-Z7kxAepogp0lstSFa0590,12949
20
20
  ml_tools/logger.py,sha256=jC4Q2OqmDm8ZO9VpuZqBSWdXryqaJvLscqVJ6caNMOk,6009
21
21
  ml_tools/utilities.py,sha256=opNR-ACH6BnLkWAKcb19ef5tFxfx22TI6E2o0RYwiGA,21021
22
- dragon_ml_toolbox-3.2.0.dist-info/METADATA,sha256=QLfQmo-eUJFkFOKbbZzZKMZ5SARrrcbhfkffrPkMHvc,3273
23
- dragon_ml_toolbox-3.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
24
- dragon_ml_toolbox-3.2.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
25
- dragon_ml_toolbox-3.2.0.dist-info/RECORD,,
22
+ dragon_ml_toolbox-3.3.0.dist-info/METADATA,sha256=rIwDGwx-RQ1XvVe0TLA8YkQIs71W7MoQJHfx28l580M,3273
23
+ dragon_ml_toolbox-3.3.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
24
+ dragon_ml_toolbox-3.3.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
25
+ dragon_ml_toolbox-3.3.0.dist-info/RECORD,,
@@ -12,6 +12,7 @@ __all__ = [
12
12
  "TransformationRecipe",
13
13
  "DataProcessor",
14
14
  "BinaryTransformer",
15
+ "MultiBinaryDummifier",
15
16
  "KeywordDummifier",
16
17
  "NumberExtractor",
17
18
  "MultiNumberExtractor",
@@ -400,12 +401,72 @@ class BinaryTransformer:
400
401
  return (~contains_keyword).cast(pl.UInt8)
401
402
 
402
403
 
404
+ class MultiBinaryDummifier:
405
+ """
406
+ A one-to-many transformer that creates multiple binary columns from a single
407
+ text column based on a list of keywords.
408
+
409
+ For each keyword provided, this transformer generates a corresponding column
410
+ with a value of 1 if the keyword is present in the input string, and 0 otherwise.
411
+ It is designed to be used within the DataProcessor pipeline.
412
+
413
+ Args:
414
+ keywords (List[str]):
415
+ A list of strings, where each string is a keyword to search for. A separate
416
+ binary column will be created for each keyword.
417
+ case_insensitive (bool):
418
+ If True, keyword matching ignores case. Defaults to True.
419
+ """
420
+ def __init__(self, keywords: List[str], case_insensitive: bool = True):
421
+ if not isinstance(keywords, list) or not all(isinstance(k, str) for k in keywords):
422
+ raise TypeError("The 'keywords' argument must be a list of strings.")
423
+ if not keywords:
424
+ raise ValueError("The 'keywords' list cannot be empty.")
425
+
426
+ self.keywords = keywords
427
+ self.case_insensitive = case_insensitive
428
+
429
+ def __call__(self, column: pl.Series) -> pl.DataFrame:
430
+ """
431
+ Executes the dummification logic.
432
+
433
+ Args:
434
+ column (pl.Series): The input Polars Series to transform.
435
+
436
+ Returns:
437
+ pl.DataFrame: A DataFrame where each column corresponds to a keyword.
438
+ """
439
+ # Ensure the input is treated as a string, preserving nulls
440
+ str_column = column.cast(pl.Utf8)
441
+
442
+ output_expressions = []
443
+ for i, keyword in enumerate(self.keywords):
444
+ # Escape keyword to treat it as a literal, not a regex pattern
445
+ base_pattern = re.escape(keyword)
446
+
447
+ # Add case-insensitivity flag `(?i)` if needed
448
+ pattern = f"(?i){base_pattern}" if self.case_insensitive else base_pattern
449
+
450
+ # Create the binary expression
451
+ expr = (
452
+ pl.when(str_column.is_null())
453
+ .then(None) # Propagate nulls from original column
454
+ .when(str_column.str.contains(pattern))
455
+ .then(pl.lit(1, dtype=pl.UInt8))
456
+ .otherwise(pl.lit(0, dtype=pl.UInt8))
457
+ .alias(f"col_{i}") # Generic name for DataProcessor
458
+ )
459
+ output_expressions.append(expr)
460
+
461
+ return pl.select(output_expressions)
462
+
463
+
403
464
  class KeywordDummifier:
404
465
  """
405
466
  A configurable transformer that creates one-hot encoded columns based on
406
467
  keyword matching in a Polars Series.
407
468
 
408
- Instantiate this class with keyword configurations. The instance can be used as a 'transform' callable compatible with the `TransformationRecipe`.
469
+ Operates on a "first match wins" principle.
409
470
 
410
471
  Args:
411
472
  group_names (List[str]):
@@ -15,6 +15,7 @@ import re
15
15
  # Keep track of all available tools, show using `info()`
16
16
  __all__ = [
17
17
  "summarize_dataframe",
18
+ "drop_zero_only_columns",
18
19
  "drop_rows_with_missing_data",
19
20
  "split_features_targets",
20
21
  "show_null_columns",
@@ -61,6 +62,47 @@ def summarize_dataframe(df: pd.DataFrame, round_digits: int = 2):
61
62
  return summary
62
63
 
63
64
 
65
+ def drop_zero_only_columns(df: pd.DataFrame, verbose: bool=True) -> pd.DataFrame:
66
+ """
67
+ Removes columns from a pandas DataFrame that contain only zeros and null/NaN values.
68
+
69
+ This utility is useful for cleaning data after dummification steps that may result in empty columns.
70
+
71
+ Args:
72
+ df (pd.DataFrame):
73
+ The pandas DataFrame to clean.
74
+
75
+ Returns:
76
+ pd.DataFrame:
77
+ A new DataFrame with the empty columns removed.
78
+ """
79
+ if not isinstance(df, pd.DataFrame):
80
+ raise TypeError("Input must be a pandas DataFrame.")
81
+
82
+ original_columns = set(df.columns)
83
+
84
+ cols_to_keep = []
85
+ for col_name in df.columns:
86
+ column = df[col_name]
87
+
88
+ # Keep any column that is not numeric by default
89
+ if not is_numeric_dtype(column):
90
+ cols_to_keep.append(col_name)
91
+ continue
92
+
93
+ # For numeric columns, check if there's at least one non-zero value.
94
+ if (column != 0).any():
95
+ cols_to_keep.append(col_name)
96
+
97
+ dropped_columns = original_columns - set(cols_to_keep)
98
+ if dropped_columns and verbose:
99
+ print(f"Dropped {len(dropped_columns)} columns:")
100
+ for dropped_column in dropped_columns:
101
+ print(f" {dropped_column}")
102
+
103
+ return df[cols_to_keep]
104
+
105
+
64
106
  def drop_rows_with_missing_data(df: pd.DataFrame, targets: Optional[list[str]], threshold: float = 0.7) -> pd.DataFrame:
65
107
  """
66
108
  Drops rows from the DataFrame using a two-stage strategy: