dragon_ml_toolbox-8.0.0-py3-none-any.whl → dragon_ml_toolbox-8.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
--- dragon_ml_toolbox-8.0.0.dist-info/METADATA
+++ dragon_ml_toolbox-8.2.0.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 8.0.0
+Version: 8.2.0
 Summary: A collection of tools for data science and machine learning projects.
 Author-email: Karl Loza <luigiloza@gmail.com>
 License-Expression: MIT
--- dragon_ml_toolbox-8.0.0.dist-info/RECORD
+++ dragon_ml_toolbox-8.2.0.dist-info/RECORD
@@ -1,6 +1,6 @@
-dragon_ml_toolbox-8.0.0.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
-dragon_ml_toolbox-8.0.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=lY4_rJPnLnMu7YBQaY-_iz1JRDcLdQzNCyeLAF1glJY,1837
-ml_tools/ETL_engineering.py,sha256=4wwZXi9_U7xfCY70jGBaKniOeZ0m75ppxWpQBd_DmLc,39369
+dragon_ml_toolbox-8.2.0.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
+dragon_ml_toolbox-8.2.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=lY4_rJPnLnMu7YBQaY-_iz1JRDcLdQzNCyeLAF1glJY,1837
+ml_tools/ETL_engineering.py,sha256=69YGK4fN5ouRBknTvU4uZ8KLQGT-hPrvwymH-IygEnk,40911
 ml_tools/GUI_tools.py,sha256=n4ZZ5kEjwK5rkOCFJE41HeLFfjhpJVLUSzk9Kd9Kr_0,45410
 ml_tools/MICE_imputation.py,sha256=oFHg-OytOzPYTzBR_wIRHhP71cMn3aupDeT59ABsXlQ,11576
 ml_tools/ML_callbacks.py,sha256=noedVMmHZ72Odbg28zqx5wkhhvX2v-jXicKE_NCAiqU,13838
@@ -21,7 +21,7 @@ ml_tools/__init__.py,sha256=q0y9faQ6e17XCQ7eUiCZ1FJ4Bg5EQqLjZ9f_l5REUUY,41
 ml_tools/_logger.py,sha256=TpgYguxO-CWYqqgLW0tqFjtwZ58PE_W2OCfWNGZr0n0,1175
 ml_tools/_script_info.py,sha256=21r83LV3RubsNZ_RTEUON6RbDf7Mh4_udweNcvdF_Fk,212
 ml_tools/custom_logger.py,sha256=nyLRxaRxkqYOFdSjI0X2BWXB8C2IU18QfmqIFKqSedI,5820
-ml_tools/data_exploration.py,sha256=P4f8OpRa7Q4i-11nkppxXw5Lx2lwlpn20GwWBbN_xbM,23901
+ml_tools/data_exploration.py,sha256=RuMHWagXrSQi1MzAMlYeBeVg7UxhVvEq8gJ9bIam2BM,27103
 ml_tools/ensemble_evaluation.py,sha256=wnqoTPg4WYWf2A8z5XT0eSlW4snEuLCXQVj88sZKzQ4,24683
 ml_tools/ensemble_inference.py,sha256=rtU7eUaQne615n2g7IHZCJI-OvrBCcjxbTkEIvtCGFQ,9414
 ml_tools/ensemble_learning.py,sha256=dAyFgSTyvxJWjc_enJ_8EUoWwiekBeoNyJNxVY-kcUU,21868
@@ -30,7 +30,7 @@ ml_tools/keys.py,sha256=HtPG8-MWh89C32A7eIlfuuA-DLwkxGkoDfwR2TGN9CQ,1074
 ml_tools/optimization_tools.py,sha256=EL5tgNFwRo-82pbRE1CFVy9noNhULD7wprWuKadPheg,5090
 ml_tools/path_manager.py,sha256=Z8e7w3MPqQaN8xmTnKuXZS6CIW59BFwwqGhGc00sdp4,13692
 ml_tools/utilities.py,sha256=LqXXTovaHbA5AOKRk6Ru6DgAPAM0wPfYU70kUjYBryo,19231
-dragon_ml_toolbox-8.0.0.dist-info/METADATA,sha256=sUJ-tiQBxu_emCNMFYDd762_a9Cpot4pWGhl2J8dXBE,6778
-dragon_ml_toolbox-8.0.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-dragon_ml_toolbox-8.0.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
-dragon_ml_toolbox-8.0.0.dist-info/RECORD,,
+dragon_ml_toolbox-8.2.0.dist-info/METADATA,sha256=C1rjTnTNSj6VI2khy7Xl1VjQ__MP6-b43x9RIQCHY3E,6778
+dragon_ml_toolbox-8.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+dragon_ml_toolbox-8.2.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
+dragon_ml_toolbox-8.2.0.dist-info/RECORD,,
--- ml_tools/ETL_engineering.py
+++ ml_tools/ETL_engineering.py
@@ -3,7 +3,6 @@ import re
 from typing import Literal, Union, Optional, Any, Callable, List, Dict, Tuple
 from ._script_info import _script_info
 from ._logger import _LOGGER
-import warnings


 __all__ = [
@@ -13,6 +12,7 @@ __all__ = [
     "DataProcessor",
     "BinaryTransformer",
     "MultiBinaryDummifier",
+    "AutoDummifier",
    "KeywordDummifier",
     "NumberExtractor",
     "MultiNumberExtractor",
@@ -277,16 +277,32 @@ class DataProcessor:
                 processed_columns.append(result.alias(output_col_spec))

             elif isinstance(result, pl.DataFrame):
-                if not isinstance(output_col_spec, list):
-                    raise TypeError(f"Function for '{input_col_name}' returned a DataFrame but 'output_col' is not a list.")
-                if len(result.columns) != len(output_col_spec):
-                    raise ValueError(
-                        f"Mismatch in '{input_col_name}': function produced {len(result.columns)} columns, "
-                        f"but recipe specifies {len(output_col_spec)} output names."
-                    )
+                # 1. Handle list-based renaming
+                if isinstance(output_col_spec, list):
+                    if len(result.columns) != len(output_col_spec):
+                        raise ValueError(
+                            f"Mismatch in '{input_col_name}': function produced {len(result.columns)} columns, "
+                            f"but recipe specifies {len(output_col_spec)} output names."
+                        )
+
+                    renamed_df = result.rename(dict(zip(result.columns, output_col_spec)))
+                    processed_columns.extend(renamed_df.get_columns())
+
+                # 2. Handle a string prefix for AutoDummifier
+                elif isinstance(output_col_spec, str):
+                    prefix = output_col_spec
+                    # Replace the original name part with the desired prefix.
+                    new_names = {
+                        col: f"{prefix}{col[len(input_col_name):]}" for col in result.columns
+                    }
+                    renamed_df = result.rename(new_names)
+                    processed_columns.extend(renamed_df.get_columns())

-                renamed_df = result.rename(dict(zip(result.columns, output_col_spec)))
-                processed_columns.extend(renamed_df.get_columns())
+                else:
+                    raise TypeError(
+                        f"Function for '{input_col_name}' returned a DataFrame, "
+                        f"so 'output_col' must be a list of names or a string prefix."
+                    )

             else:
                 raise TypeError(f"Function for '{input_col_name}' returned an unexpected type: {type(result)}.")
@@ -413,6 +429,27 @@ class BinaryTransformer:
         return (~contains_keyword).cast(pl.UInt8)


+class AutoDummifier:
+    """
+    A transformer that performs one-hot encoding on a categorical column,
+    automatically detecting the unique categories from the data.
+    """
+    def __call__(self, column: pl.Series) -> pl.DataFrame:
+        """
+        Executes the one-hot encoding logic.
+
+        Args:
+            column (pl.Series): The input Polars Series of categories.
+
+        Returns:
+            pl.DataFrame: A DataFrame with one-hot encoded columns.
+                Column names are auto-generated by Polars as
+                '{original_col_name}_{category_value}'.
+        """
+        # Ensure the column is treated as a string before creating dummies
+        return column.cast(pl.Utf8).to_dummies()
+
+
 class MultiBinaryDummifier:
     """
     A one-to-many transformer that creates multiple binary columns from a single
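
As a quick usage sketch of the new AutoDummifier (the data values are illustrative; the import path follows the RECORD listing above):

```python
import polars as pl
from ml_tools.ETL_engineering import AutoDummifier

colors = pl.Series("color", ["red", "blue", "red", "green"])
dummies = AutoDummifier()(colors)

# One indicator column per observed category, named by Polars' to_dummies,
# e.g. 'color_blue', 'color_green', 'color_red'.
print(dummies.columns)
```
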
--- ml_tools/data_exploration.py
+++ ml_tools/data_exploration.py
@@ -5,10 +5,12 @@ import matplotlib.pyplot as plt
 import seaborn as sns
 from typing import Union, Literal, Dict, Tuple, List, Optional
 from pathlib import Path
+import re
+
 from .path_manager import sanitize_filename, make_fullpath
 from ._script_info import _script_info
 from ._logger import _LOGGER
-import re
+from .utilities import save_dataframe


 # Keep track of all available tools, show using `info()`
@@ -18,6 +20,7 @@ __all__ = [
     "drop_rows_with_missing_data",
     "show_null_columns",
     "drop_columns_with_missing_data",
+    "drop_macro",
     "split_features_targets",
     "split_continuous_binary",
     "plot_correlation_heatmap",
@@ -155,7 +158,7 @@ def drop_rows_with_missing_data(df: pd.DataFrame, targets: Optional[list[str]],

 def show_null_columns(df: pd.DataFrame, round_digits: int = 2):
     """
-    Displays a table of columns with missing values, showing both the count and
+    Returns a table of columns with missing values, showing both the count and
     percentage of missing entries per column.

     Parameters:
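
The docstring fix above matches how the new `drop_macro` (next hunk) consumes this function: the null summary is returned, not merely displayed. A minimal sketch, assuming only the signature shown in this diff:

```python
import pandas as pd
from ml_tools.data_exploration import show_null_columns

df = pd.DataFrame({"a": [1.0, None, 3.0], "b": [None, None, 6.0]})
null_report = show_null_columns(df, round_digits=2)  # returned, so it can be saved or filtered
print(null_report)
```
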
@@ -221,6 +224,81 @@ def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7, sho
     return df


+def drop_macro(df: pd.DataFrame,
+               log_directory: Union[str, Path],
+               targets: list[str],
+               skip_targets: bool = False,
+               threshold: float = 0.7) -> pd.DataFrame:
+    """
+    Iteratively removes rows and columns with excessive missing data.
+
+    This function performs a comprehensive cleaning cycle on a DataFrame. It
+    repeatedly drops columns with constant values, followed by rows and columns
+    that exceed a specified threshold of missing values. The process continues
+    until the DataFrame's dimensions stabilize, ensuring that the interdependency
+    between row and column deletions is handled.
+
+    Initial and final missing data reports are saved to the specified log directory.
+
+    Args:
+        df (pd.DataFrame): The input pandas DataFrame to be cleaned.
+        log_directory (Union[str, Path]): Path to the directory where the
+            'Missing_Data_start.csv' and 'Missing_Data_final.csv' logs
+            will be saved.
+        targets (list[str]): A list of column names to be treated as target
+            variables. This list guides the row-dropping logic.
+        skip_targets (bool, optional): If True, the columns listed in `targets`
+            will be exempt from being dropped, even if they exceed the missing
+            data threshold.
+        threshold (float, optional): The proportion of missing data required to drop
+            a row or column. For example, 0.7 means a row/column will be
+            dropped if 70% or more of its data is missing.
+
+    Returns:
+        pd.DataFrame: A new, cleaned DataFrame with offending rows and columns removed.
+    """
+    # make a deep copy to work with
+    df_clean = df.copy()
+
+    # Log initial state
+    missing_data = show_null_columns(df=df_clean)
+    save_dataframe(df=missing_data.reset_index(drop=False),
+                   save_dir=log_directory,
+                   filename="Missing_Data_start")
+
+    # Clean cycles for rows and columns
+    master = True
+    while master:
+        # track rows and columns
+        initial_rows, initial_columns = df_clean.shape
+
+        # drop constant columns
+        df_clean = drop_constant_columns(df=df_clean)
+
+        # clean rows
+        df_clean = drop_rows_with_missing_data(df=df_clean, targets=targets, threshold=threshold)
+
+        # clean columns
+        if skip_targets:
+            df_clean = drop_columns_with_missing_data(df=df_clean, threshold=threshold, show_nulls_after=False, skip_columns=targets)
+        else:
+            df_clean = drop_columns_with_missing_data(df=df_clean, threshold=threshold, show_nulls_after=False)
+
+        # cleaned?
+        remaining_rows, remaining_columns = df_clean.shape
+        if remaining_rows >= initial_rows and remaining_columns >= initial_columns:
+            master = False
+
+    # log final state
+    missing_data = show_null_columns(df=df_clean)
+    save_dataframe(df=missing_data.reset_index(drop=False),
+                   save_dir=log_directory,
+                   filename="Missing_Data_final")
+
+    # return cleaned dataframe
+    return df_clean
+
+
 def split_features_targets(df: pd.DataFrame, targets: list[str]):
     """
     Splits a DataFrame's columns into features and targets.
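
A minimal usage sketch for the new `drop_macro` (the toy DataFrame and the `logs` directory are illustrative assumptions, not from the package):

```python
import numpy as np
import pandas as pd
from ml_tools.data_exploration import drop_macro

# Toy frame: 'noise' is ~80% missing, 'const' never varies (illustrative data).
df = pd.DataFrame({
    "feature": [1.0, 2.0, np.nan, 4.0, 5.0],
    "noise":   [np.nan, np.nan, np.nan, np.nan, 1.0],
    "const":   [7, 7, 7, 7, 7],
    "target":  [0.0, 1.0, 0.0, 1.0, np.nan],
})

# Missing-data reports are written to 'logs/' as Missing_Data_start.csv
# and Missing_Data_final.csv.
df_clean = drop_macro(df, log_directory="logs", targets=["target"], threshold=0.7)
print(df_clean.columns.tolist())
```
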