dragon-ml-toolbox 4.2.1__py3-none-any.whl → 4.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dragon-ml-toolbox might be problematic. Click here for more details.

@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dragon-ml-toolbox
3
- Version: 4.2.1
3
+ Version: 4.3.0
4
4
  Summary: A collection of tools for data science and machine learning projects.
5
5
  Author-email: Karl Loza <luigiloza@gmail.com>
6
6
  License-Expression: MIT
@@ -1,6 +1,6 @@
1
- dragon_ml_toolbox-4.2.1.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
2
- dragon_ml_toolbox-4.2.1.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=lY4_rJPnLnMu7YBQaY-_iz1JRDcLdQzNCyeLAF1glJY,1837
3
- ml_tools/ETL_engineering.py,sha256=rlu0bUekdKREcTR0x1jn_TSEqhxgfq3QU71hy6ZyaD8,39503
1
+ dragon_ml_toolbox-4.3.0.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
2
+ dragon_ml_toolbox-4.3.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=lY4_rJPnLnMu7YBQaY-_iz1JRDcLdQzNCyeLAF1glJY,1837
3
+ ml_tools/ETL_engineering.py,sha256=P7HN_e3vfmrOqDDK-IenyRSFQPr0N3V9e2gN75QFVWs,39372
4
4
  ml_tools/GUI_tools.py,sha256=n4ZZ5kEjwK5rkOCFJE41HeLFfjhpJVLUSzk9Kd9Kr_0,45410
5
5
  ml_tools/MICE_imputation.py,sha256=b6ZTs8RedXFifOpuMCzr68xM16mCBVh1Ua6kcGfiVtg,11462
6
6
  ml_tools/ML_callbacks.py,sha256=0a-Rbr0Xp_B1FNopOKBBmuJ4MqazS5JgDiT7wx1dHvE,13161
@@ -16,7 +16,7 @@ ml_tools/_logger.py,sha256=TpgYguxO-CWYqqgLW0tqFjtwZ58PE_W2OCfWNGZr0n0,1175
16
16
  ml_tools/_pytorch_models.py,sha256=ewPPsTHgmRPzMMWwObZOdH1vxm2Ij2VWZP38NC6zSH4,10135
17
17
  ml_tools/_script_info.py,sha256=21r83LV3RubsNZ_RTEUON6RbDf7Mh4_udweNcvdF_Fk,212
18
18
  ml_tools/custom_logger.py,sha256=a3ywSCQT7j5ypR-usnKh2l861d_aVJ93ZRVqxrHsBBw,4112
19
- ml_tools/data_exploration.py,sha256=rJhvxUqVbEuB_7HG-PfLH3vaA7hrZEtbVHg9QO9VS4A,22837
19
+ ml_tools/data_exploration.py,sha256=T4nO9YSDGvrpom7JELtoQTyg7XTEmvQz-jG0KKxqTRk,23467
20
20
  ml_tools/datasetmaster.py,sha256=_tNC2v98eCQGr3nMW_EFs83TRgRme8Uc7ttg1vosmQU,30106
21
21
  ml_tools/ensemble_inference.py,sha256=0SNX3YAz5bpvtwYmqEwqyWeIJP2Pb-v-bemENRSO7qg,9426
22
22
  ml_tools/ensemble_learning.py,sha256=Zi1oy6G2FWnTI5hBwjlexwF3JKALFS2FN6F8HAlVi_s,35391
@@ -24,7 +24,7 @@ ml_tools/handle_excel.py,sha256=J9iwIqMZemoxK49J5osSwp9Ge0h9YTKyYGbOm53hcno,1300
24
24
  ml_tools/keys.py,sha256=kK9UF-hek2VcPGFILCKl5geoN6flmMOu7IzhdEA6z5Y,1068
25
25
  ml_tools/path_manager.py,sha256=Z8e7w3MPqQaN8xmTnKuXZS6CIW59BFwwqGhGc00sdp4,13692
26
26
  ml_tools/utilities.py,sha256=mz-M351DzxWxnYVcLX-7ZQ6c-RGoCV9g4VTS9Qif2Es,18348
27
- dragon_ml_toolbox-4.2.1.dist-info/METADATA,sha256=mzW1BLOxrCKZAoZgqzYRcNhHpO4fTNxDGvUwuF5wG88,6572
28
- dragon_ml_toolbox-4.2.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
29
- dragon_ml_toolbox-4.2.1.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
30
- dragon_ml_toolbox-4.2.1.dist-info/RECORD,,
27
+ dragon_ml_toolbox-4.3.0.dist-info/METADATA,sha256=7aZO_5P8SDx4tPFTtb3MTAaRgf_vbcOEURaxpT3MGK8,6572
28
+ dragon_ml_toolbox-4.3.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
29
+ dragon_ml_toolbox-4.3.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
30
+ dragon_ml_toolbox-4.3.0.dist-info/RECORD,,
@@ -3,6 +3,7 @@ import re
3
3
  from typing import Literal, Union, Optional, Any, Callable, List, Dict, Tuple
4
4
  from ._script_info import _script_info
5
5
  from ._logger import _LOGGER
6
+ import warnings
6
7
 
7
8
 
8
9
  __all__ = [
@@ -50,7 +51,7 @@ class ColumnCleaner:
50
51
  ```python
51
52
  id_rules = {
52
53
  # Matches 'ID-12345' or 'ID 12345' and reformats to 'ID:12345'
53
- r'ID[- ](\d+)': r'ID:$1'
54
+ r'ID[- ](\\d+)': r'ID:$1'
54
55
  }
55
56
 
56
57
  id_cleaner = ColumnCleaner(column_name='user_id', rules=id_rules)
@@ -700,26 +701,28 @@ class MultiNumberExtractor:
700
701
 
701
702
  class RatioCalculator:
702
703
  """
703
- A transformer that parses a string ratio (e.g., "40:5" or "30/2") and computes the result of the division.
704
-
705
- Args:
706
- regex_pattern (str, optional):
707
- The regex pattern to find the numerator and denominator. It MUST
708
- contain exactly two capturing groups: the first for the
709
- numerator and the second for the denominator. Defaults to a
710
- pattern that handles common delimiters like ':' and '/'.
704
+ A transformer that parses a string ratio (e.g., "40:5" or "30/2") and
705
+ computes the result of the division. It gracefully handles strings that
706
+ do not match the pattern by returning null.
711
707
  """
712
708
  def __init__(
713
709
  self,
714
- regex_pattern: str = r"(\d+\.?\d*)\s*[:/]\s*(\d+\.?\d*)"
710
+ # Default pattern includes the full-width colon '：'
711
+ regex_pattern: str = r"(\d+\.?\d*)\s*[:：/]\s*(\d+\.?\d*)"
715
712
  ):
716
- # --- Validation ---
713
+ # --- Robust Validation ---
717
714
  try:
718
- if re.compile(regex_pattern).groups != 2:
715
+ compiled_pattern = re.compile(regex_pattern)
716
+ if compiled_pattern.groups != 2:
719
717
  raise ValueError(
720
- "regex_pattern must contain exactly two "
718
+ "RatioCalculator regex_pattern must contain exactly two "
721
719
  "capturing groups '(...)'."
722
720
  )
721
+ if compiled_pattern.groupindex:
722
+ raise ValueError(
723
+ "RatioCalculator must be initialized with unnamed capturing groups "
724
+ "(e.g., '(\\d+)'), not named groups (e.g., '(?P<name>\\d+)')."
725
+ )
723
726
  except re.error as e:
724
727
  raise ValueError(f"Invalid regex pattern provided: {e}") from e
725
728
 
@@ -728,27 +731,20 @@ class RatioCalculator:
728
731
  def __call__(self, column: pl.Series) -> pl.Series:
729
732
  """
730
733
  Applies the ratio calculation logic to the input column.
731
-
732
- Args:
733
- column (pl.Series): The input Polars Series of ratio strings.
734
-
735
- Returns:
736
- pl.Series: A new Series of floats containing the division result.
737
- Returns null for invalid formats or division by zero.
734
+ This version uses .str.extract() for maximum stability.
738
735
  """
739
- # .extract_groups returns a struct with a field for each capture group
740
- # e.g., {"group_1": "40", "group_2": "5"}
741
- groups = column.str.extract_groups(self.regex_pattern)
736
+ # Extract numerator (group 1) and denominator (group 2) separately.
737
+ numerator_expr = column.str.extract(self.regex_pattern, 1).cast(pl.Float64, strict=False)
738
+ denominator_expr = column.str.extract(self.regex_pattern, 2).cast(pl.Float64, strict=False)
742
739
 
743
- # Extract numerator and denominator, casting to float
744
- # strict=False ensures that non-matches become null
745
- numerator = groups.struct.field("group_1").cast(pl.Float64, strict=False)
746
- denominator = groups.struct.field("group_2").cast(pl.Float64, strict=False)
740
+ # Calculate the ratio, handling division by zero.
741
+ final_expr = pl.when(denominator_expr != 0).then(
742
+ numerator_expr / denominator_expr
743
+ ).otherwise(
744
+ None # Handles both null denominators and division by zero
745
+ )
747
746
 
748
- # Safely perform division, returning null if denominator is 0
749
- final_expr = pl.when(denominator != 0).then(numerator / denominator).otherwise(None)
750
-
751
- return pl.select(final_expr).to_series()
747
+ return pl.select(final_expr.round(4)).to_series()
752
748
 
753
749
 
754
750
  class CategoryMapper:
@@ -15,9 +15,9 @@ __all__ = [
15
15
  "summarize_dataframe",
16
16
  "drop_constant_columns",
17
17
  "drop_rows_with_missing_data",
18
- "split_features_targets",
19
18
  "show_null_columns",
20
19
  "drop_columns_with_missing_data",
20
+ "split_features_targets",
21
21
  "split_continuous_binary",
22
22
  "plot_correlation_heatmap",
23
23
  "plot_value_distributions",
@@ -125,7 +125,9 @@ def drop_rows_with_missing_data(df: pd.DataFrame, targets: Optional[list[str]],
125
125
 
126
126
  # Stage 1: Drop rows with all target columns missing
127
127
  if targets is not None:
128
- target_na = df_clean[targets].isnull().all(axis=1)
128
+ # validate targets
129
+ valid_targets = [target for target in targets if target in df_clean.columns]
130
+ target_na = df_clean[valid_targets].isnull().all(axis=1)
129
131
  if target_na.any():
130
132
  print(f"🧹 Dropping {target_na.sum()} rows with all target columns missing.")
131
133
  df_clean = df_clean[~target_na]
@@ -150,30 +152,6 @@ def drop_rows_with_missing_data(df: pd.DataFrame, targets: Optional[list[str]],
150
152
  return df_clean
151
153
 
152
154
 
153
- def split_features_targets(df: pd.DataFrame, targets: list[str]):
154
- """
155
- Splits a DataFrame's columns into features and targets.
156
-
157
- Args:
158
- df (pd.DataFrame): Pandas DataFrame containing the dataset.
159
- targets (list[str]): List of column names to be treated as target variables.
160
-
161
- Returns:
162
- tuple: A tuple containing:
163
- - pd.DataFrame: Features dataframe.
164
- - pd.DataFrame: Targets dataframe.
165
-
166
- Prints:
167
- - Shape of the original dataframe.
168
- - Shape of the features dataframe.
169
- - Shape of the targets dataframe.
170
- """
171
- df_targets = df[targets]
172
- df_features = df.drop(columns=targets)
173
- print(f"Original shape: {df.shape}\nFeatures shape: {df_features.shape}\nTargets shape: {df_targets.shape}")
174
- return df_features, df_targets
175
-
176
-
177
155
  def show_null_columns(df: pd.DataFrame, round_digits: int = 2):
178
156
  """
179
157
  Displays a table of columns with missing values, showing both the count and
@@ -202,7 +180,7 @@ def show_null_columns(df: pd.DataFrame, round_digits: int = 2):
202
180
  return null_summary
203
181
 
204
182
 
205
- def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7, show_nulls_after: bool = True) -> pd.DataFrame:
183
+ def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7, show_nulls_after: bool = True, skip_columns: Optional[List[str]]=None) -> pd.DataFrame:
206
184
  """
207
185
  Drops columns with more than `threshold` fraction of missing values.
208
186
 
@@ -210,11 +188,22 @@ def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7, sho
210
188
  df (pd.DataFrame): The input DataFrame.
211
189
  threshold (float): Fraction of missing values above which columns are dropped.
212
190
  show_nulls_after (bool): Prints `show_null_columns` after dropping columns.
191
+ skip_columns (list[str] | None): If given, these columns wont be included in the drop process.
213
192
 
214
193
  Returns:
215
194
  pd.DataFrame: A new DataFrame without the dropped columns.
216
195
  """
217
- missing_fraction = df.isnull().mean()
196
+ # If skip_columns is provided, create a list of columns to check.
197
+ # Otherwise, check all columns.
198
+ cols_to_check = df.columns
199
+ if skip_columns:
200
+ # Use set difference for efficient exclusion
201
+ cols_to_check = df.columns.difference(skip_columns)
202
+
203
+ # Calculate the missing fraction only on the columns to be checked
204
+ missing_fraction = df[cols_to_check].isnull().mean()
205
+
206
+
218
207
  cols_to_drop = missing_fraction[missing_fraction > threshold].index
219
208
 
220
209
  if len(cols_to_drop) > 0:
@@ -231,6 +220,30 @@ def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7, sho
231
220
  return df
232
221
 
233
222
 
223
+ def split_features_targets(df: pd.DataFrame, targets: list[str]):
224
+ """
225
+ Splits a DataFrame's columns into features and targets.
226
+
227
+ Args:
228
+ df (pd.DataFrame): Pandas DataFrame containing the dataset.
229
+ targets (list[str]): List of column names to be treated as target variables.
230
+
231
+ Returns:
232
+ tuple: A tuple containing:
233
+ - pd.DataFrame: Features dataframe.
234
+ - pd.DataFrame: Targets dataframe.
235
+
236
+ Prints:
237
+ - Shape of the original dataframe.
238
+ - Shape of the features dataframe.
239
+ - Shape of the targets dataframe.
240
+ """
241
+ df_targets = df[targets]
242
+ df_features = df.drop(columns=targets)
243
+ print(f"Original shape: {df.shape}\nFeatures shape: {df_features.shape}\nTargets shape: {df_targets.shape}")
244
+ return df_features, df_targets
245
+
246
+
234
247
  def split_continuous_binary(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
235
248
  """
236
249
  Split DataFrame into two DataFrames: one with continuous columns, one with binary columns.