dragon-ml-toolbox 19.12.2__tar.gz → 19.13.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116)
  1. {dragon_ml_toolbox-19.12.2/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-19.13.0}/PKG-INFO +1 -1
  2. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0/dragon_ml_toolbox.egg-info}/PKG-INFO +1 -1
  3. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_data_exploration.py +35 -60
  4. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/pyproject.toml +1 -1
  5. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/LICENSE +0 -0
  6. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/LICENSE-THIRD-PARTY.md +0 -0
  7. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/README.md +0 -0
  8. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/dragon_ml_toolbox.egg-info/SOURCES.txt +0 -0
  9. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/dragon_ml_toolbox.egg-info/dependency_links.txt +0 -0
  10. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/dragon_ml_toolbox.egg-info/requires.txt +0 -0
  11. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/dragon_ml_toolbox.egg-info/top_level.txt +0 -0
  12. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/ETL_cleaning.py +0 -0
  13. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/ETL_engineering.py +0 -0
  14. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/GUI_tools.py +0 -0
  15. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/IO_tools.py +0 -0
  16. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/MICE_imputation.py +0 -0
  17. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_callbacks.py +0 -0
  18. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_chaining_inference.py +0 -0
  19. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_chaining_utilities.py +0 -0
  20. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_configuration.py +0 -0
  21. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_configuration_pytab.py +0 -0
  22. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_datasetmaster.py +0 -0
  23. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_evaluation.py +0 -0
  24. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_evaluation_captum.py +0 -0
  25. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_evaluation_multi.py +0 -0
  26. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_finalize_handler.py +0 -0
  27. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_inference.py +0 -0
  28. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_models.py +0 -0
  29. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_models_advanced.py +0 -0
  30. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_models_pytab.py +0 -0
  31. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_optimization.py +0 -0
  32. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_optimization_pareto.py +0 -0
  33. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_scaler.py +0 -0
  34. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_sequence_datasetmaster.py +0 -0
  35. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_sequence_evaluation.py +0 -0
  36. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_sequence_inference.py +0 -0
  37. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_sequence_models.py +0 -0
  38. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_trainer.py +0 -0
  39. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_utilities.py +0 -0
  40. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_vision_datasetmaster.py +0 -0
  41. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_vision_evaluation.py +0 -0
  42. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_vision_inference.py +0 -0
  43. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_vision_models.py +0 -0
  44. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_vision_transformers.py +0 -0
  45. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/PSO_optimization.py +0 -0
  46. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/SQL.py +0 -0
  47. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/VIF_factor.py +0 -0
  48. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/__init__.py +0 -0
  49. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ETL_cleaning.py +0 -0
  50. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ETL_engineering.py +0 -0
  51. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_GUI_tools.py +0 -0
  52. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_IO_tools.py +0 -0
  53. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_MICE_imputation.py +0 -0
  54. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_callbacks.py +0 -0
  55. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_chaining_inference.py +0 -0
  56. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_chaining_utilities.py +0 -0
  57. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_configuration.py +0 -0
  58. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_configuration_pytab.py +0 -0
  59. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_datasetmaster.py +0 -0
  60. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_evaluation.py +0 -0
  61. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_evaluation_captum.py +0 -0
  62. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_evaluation_multi.py +0 -0
  63. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_finalize_handler.py +0 -0
  64. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_inference.py +0 -0
  65. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_models.py +0 -0
  66. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_models_advanced.py +0 -0
  67. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_models_pytab.py +0 -0
  68. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_optimization.py +0 -0
  69. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_optimization_pareto.py +0 -0
  70. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_scaler.py +0 -0
  71. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_sequence_datasetmaster.py +0 -0
  72. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_sequence_evaluation.py +0 -0
  73. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_sequence_inference.py +0 -0
  74. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_sequence_models.py +0 -0
  75. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_trainer.py +0 -0
  76. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_utilities.py +0 -0
  77. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_vision_datasetmaster.py +0 -0
  78. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_vision_evaluation.py +0 -0
  79. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_vision_inference.py +0 -0
  80. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_vision_models.py +0 -0
  81. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_vision_transformers.py +0 -0
  82. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_PSO_optimization.py +0 -0
  83. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_SQL.py +0 -0
  84. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_VIF_factor.py +0 -0
  85. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/__init__.py +0 -0
  86. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ensemble_evaluation.py +0 -0
  87. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ensemble_inference.py +0 -0
  88. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ensemble_learning.py +0 -0
  89. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_excel_handler.py +0 -0
  90. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_keys.py +0 -0
  91. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_logger.py +0 -0
  92. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_math_utilities.py +0 -0
  93. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_models_advanced_base.py +0 -0
  94. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_models_advanced_helpers.py +0 -0
  95. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_optimization_tools.py +0 -0
  96. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_path_manager.py +0 -0
  97. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_plot_fonts.py +0 -0
  98. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_schema.py +0 -0
  99. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_script_info.py +0 -0
  100. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_serde.py +0 -0
  101. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_utilities.py +0 -0
  102. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/constants.py +0 -0
  103. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/data_exploration.py +0 -0
  104. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/ensemble_evaluation.py +0 -0
  105. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/ensemble_inference.py +0 -0
  106. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/ensemble_learning.py +0 -0
  107. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/excel_handler.py +0 -0
  108. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/keys.py +0 -0
  109. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/math_utilities.py +0 -0
  110. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/optimization_tools.py +0 -0
  111. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/path_manager.py +0 -0
  112. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/plot_fonts.py +0 -0
  113. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/schema.py +0 -0
  114. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/serde.py +0 -0
  115. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/ml_tools/utilities.py +0 -0
  116. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.13.0}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dragon-ml-toolbox
3
- Version: 19.12.2
3
+ Version: 19.13.0
4
4
  Summary: Complete pipelines and helper tools for data science and machine learning projects.
5
5
  Author-email: Karl Luigi Loza Vidaurre <luigiloza@gmail.com>
6
6
  License-Expression: MIT
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dragon-ml-toolbox
3
- Version: 19.12.2
3
+ Version: 19.13.0
4
4
  Summary: Complete pipelines and helper tools for data science and machine learning projects.
5
5
  Author-email: Karl Luigi Loza Vidaurre <luigiloza@gmail.com>
6
6
  License-Expression: MIT
@@ -28,19 +28,19 @@ __all__ = [
28
28
  "plot_value_distributions",
29
29
  "plot_continuous_vs_target",
30
30
  "plot_categorical_vs_target",
31
- "encode_categorical_features",
32
31
  "split_features_targets",
33
- "split_continuous_binary",
32
+ "encode_categorical_features",
34
33
  "clip_outliers_single",
35
34
  "clip_outliers_multi",
36
35
  "drop_outlier_samples",
37
36
  "plot_correlation_heatmap",
37
+ "finalize_feature_schema",
38
38
  "match_and_filter_columns_by_regex",
39
39
  "standardize_percentages",
40
40
  "reconstruct_one_hot",
41
41
  "reconstruct_binary",
42
42
  "reconstruct_multibinary",
43
- "finalize_feature_schema",
43
+ "split_continuous_binary",
44
44
  "apply_feature_schema"
45
45
  ]
46
46
 
@@ -108,22 +108,17 @@ def drop_constant_columns(df: pd.DataFrame, verbose: bool = True) -> pd.DataFram
108
108
  for col_name in df_clean.columns:
109
109
  column = df_clean[col_name]
110
110
 
111
- # We can apply this logic to all columns or only focus on numeric ones.
112
- # if not is_numeric_dtype(column):
113
- # cols_to_keep.append(col_name)
114
- # continue
115
-
116
111
  # Keep a column if it has more than one unique value (nunique ignores NaNs by default)
117
112
  if column.nunique(dropna=True) > 1:
118
113
  cols_to_keep.append(col_name)
119
114
 
120
115
  dropped_columns = original_columns - set(cols_to_keep)
121
116
  if verbose:
122
- _LOGGER.info(f"🧹 Dropped {len(dropped_columns)} constant columns.")
123
117
  if dropped_columns:
124
- for dropped_column in dropped_columns:
125
- print(f" {dropped_column}")
126
-
118
+ _LOGGER.info(f"🧹 Dropped {len(dropped_columns)} constant columns: {list(dropped_columns)}")
119
+ else:
120
+ _LOGGER.info("No constant columns found.")
121
+
127
122
  # Return a new DataFrame with only the columns to keep
128
123
  df_clean = df_clean[cols_to_keep]
129
124
 
@@ -338,8 +333,7 @@ def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7, sho
338
333
  cols_to_drop = missing_fraction[missing_fraction > threshold].index # type: ignore
339
334
 
340
335
  if len(cols_to_drop) > 0:
341
- _LOGGER.info(f"🧹 Dropping columns with more than {threshold*100:.0f}% missing data:")
342
- print(list(cols_to_drop))
336
+ _LOGGER.info(f"🧹 Dropping columns with more than {threshold*100:.0f}% missing data: {list(cols_to_drop)}")
343
337
 
344
338
  result_df = df.drop(columns=cols_to_drop)
345
339
  if show_nulls_after:
@@ -369,9 +363,8 @@ def drop_macro(df: pd.DataFrame,
369
363
 
370
364
  Args:
371
365
  df (pd.DataFrame): The input pandas DataFrame to be cleaned.
372
- log_directory (Union[str, Path]): Path to the directory where the
373
- 'Missing_Data_start.csv' and 'Missing_Data_final.csv' logs
374
- will be saved.
366
+ log_directory (Union[str, Path]): Path to the directory where the missing data reports
367
+ and plots will be saved inside a "Missing Report" subdirectory.
375
368
  targets (list[str]): A list of column names to be treated as target
376
369
  variables. This list guides the row-dropping logic.
377
370
  skip_targets (bool, optional): If True, the columns listed in `targets`
@@ -387,15 +380,18 @@ def drop_macro(df: pd.DataFrame,
387
380
  # make a deep copy to work with
388
381
  df_clean = df.copy()
389
382
 
383
+ base_dir_path = make_fullpath(log_directory, make=True, enforce="directory")
384
+ full_path = base_dir_path / "Missing Report"
385
+
390
386
  # Log initial state + Plot
391
387
  missing_data_start = show_null_columns(
392
388
  df=df_clean,
393
- plot_to_dir=log_directory,
389
+ plot_to_dir=full_path,
394
390
  plot_filename="Original",
395
391
  use_all_columns=True
396
392
  )
397
393
  save_dataframe_filename(df=missing_data_start.reset_index(drop=False),
398
- save_dir=log_directory,
394
+ save_dir=full_path,
399
395
  filename="Missing_Data_Original")
400
396
 
401
397
  # Clean cycles for rows and columns
@@ -424,12 +420,12 @@ def drop_macro(df: pd.DataFrame,
424
420
  # log final state + plot
425
421
  missing_data_final = show_null_columns(
426
422
  df=df_clean,
427
- plot_to_dir=log_directory,
423
+ plot_to_dir=full_path,
428
424
  plot_filename="Processed",
429
425
  use_all_columns=True
430
426
  )
431
427
  save_dataframe_filename(df=missing_data_final.reset_index(drop=False),
432
- save_dir=log_directory,
428
+ save_dir=full_path,
433
429
  filename="Missing_Data_Processed")
434
430
 
435
431
  # return cleaned dataframe
@@ -476,9 +472,8 @@ def plot_value_distributions(
476
472
  df: pd.DataFrame,
477
473
  save_dir: Union[str, Path],
478
474
  categorical_columns: Optional[List[str]] = None,
479
- categorical_cardinality_threshold: int = 10,
480
- max_categories: int = 50,
481
- fill_na_with: str = "Missing"
475
+ max_categories: int = 100,
476
+ fill_na_with: str = "MISSING DATA"
482
477
  ):
483
478
  """
484
479
  Plots and saves the value distributions for all columns in a DataFrame,
@@ -491,15 +486,9 @@ def plot_value_distributions(
491
486
  Args:
492
487
  df (pd.DataFrame): The input DataFrame to analyze.
493
488
  save_dir (str | Path): Directory path to save the plots.
494
- categorical_columns (List[str] | None): If provided, this list
495
- of column names will be treated as categorical, and all other columns will be treated as continuous. This
496
- overrides the `continuous_cardinality_threshold` logic.
497
- categorical_cardinality_threshold (int): A numeric column will be treated
498
- as 'categorical' if its number of unique values is less than or equal to this threshold. (Ignored if `categorical_columns` is set).
499
- max_categories (int): The maximum number of unique categories a
500
- categorical feature can have to be plotted. Features exceeding this limit will be skipped.
501
- fill_na_with (str): A string to replace NaN values in categorical columns. This allows plotting 'missingness' as its
502
- own category. Defaults to "Missing".
489
+ categorical_columns (List[str] | None): If provided, these will be treated as categorical, and all other columns will be treated as continuous.
490
+ max_categories (int): The maximum number of unique categories a categorical feature can have to be plotted. Features exceeding this limit will be skipped.
491
+ fill_na_with (str): A string to replace NaN values in categorical columns. This allows plotting 'missingness' as its own category.
503
492
 
504
493
  Notes:
505
494
  - `seaborn.histplot` with KDE is used for continuous features.
@@ -534,7 +523,7 @@ def plot_value_distributions(
534
523
  is_continuous = True
535
524
  else:
536
525
  # Use auto-detection
537
- if is_numeric and n_unique > categorical_cardinality_threshold:
526
+ if is_numeric:
538
527
  is_continuous = True
539
528
 
540
529
  # --- Case 1: Continuous Numeric (Histogram) ---
@@ -549,7 +538,7 @@ def plot_value_distributions(
549
538
  save_path = numeric_dir / f"{sanitize_filename(col_name)}.svg"
550
539
  numeric_plots_saved += 1
551
540
 
552
- # --- Case 2: Categorical or Low-Cardinality Numeric (Count Plot) ---
541
+ # --- Case 2: Categorical (Count Plot) ---
553
542
  else:
554
543
  # Check max categories
555
544
  if n_unique > max_categories:
@@ -558,7 +547,7 @@ def plot_value_distributions(
558
547
 
559
548
  # Adaptive figure size
560
549
  fig_width = max(10, n_unique * 0.5)
561
- plt.figure(figsize=(fig_width, 7))
550
+ plt.figure(figsize=(fig_width, 8))
562
551
 
563
552
  # Make a temporary copy for plotting to handle NaNs
564
553
  temp_series = df[col_name].copy()
@@ -573,7 +562,7 @@ def plot_value_distributions(
573
562
 
574
563
  # Get category order by frequency
575
564
  order = temp_series.value_counts().index
576
- sns.countplot(x=temp_series, order=order, palette="viridis")
565
+ sns.countplot(x=temp_series, order=order, palette="Oranges", hue=temp_series, legend=False)
577
566
 
578
567
  plt.title(f"Distribution of '{col_name}' (Categorical)")
579
568
  plt.xlabel(col_name)
@@ -743,12 +732,11 @@ def plot_categorical_vs_target(
743
732
  targets: List[str],
744
733
  save_dir: Union[str, Path],
745
734
  features: Optional[List[str]] = None,
746
- plot_type: Literal["box", "violin"] = "box",
747
- max_categories: int = 20,
748
- fill_na_with: str = "Missing"
735
+ max_categories: int = 50,
736
+ fill_na_with: str = "MISSING DATA"
749
737
  ):
750
738
  """
751
- Plots each categorical feature against each numeric target using box or violin plots.
739
+ Plots each categorical feature against each numeric target using box plots.
752
740
 
753
741
  This function is a core EDA step for regression tasks to understand the
754
742
  relationship between a categorical independent variable and a continuous
@@ -761,7 +749,6 @@ def plot_categorical_vs_target(
761
749
  targets (List[str]): A list of numeric target column names (y-axis).
762
750
  save_dir (str | Path): The base directory where plots will be saved. A subdirectory will be created here for each target.
763
751
  features (List[str] | None): A list of categorical feature column names (x-axis). If None, all non-numeric (object) columns will be used.
764
- plot_type (Literal["box", "violin"]): The type of plot to generate.
765
752
  max_categories (int): The maximum number of unique categories a feature can have to be plotted. Features exceeding this limit will be skipped.
766
753
  fill_na_with (str): A string to replace NaN values in categorical columns. This allows plotting 'missingness' as its own category. Defaults to "Missing".
767
754
 
@@ -771,10 +758,6 @@ def plot_categorical_vs_target(
771
758
  """
772
759
  # 1. Validate the base save directory and inputs
773
760
  base_save_path = make_fullpath(save_dir, make=True, enforce="directory")
774
-
775
- if plot_type not in ["box", "violin"]:
776
- _LOGGER.error(f"Invalid plot type '{plot_type}'")
777
- raise ValueError()
778
761
 
779
762
  # 2. Validate target columns (must be numeric)
780
763
  valid_targets = []
@@ -797,14 +780,10 @@ def plot_categorical_vs_target(
797
780
  for col in df.columns:
798
781
  if col in valid_targets:
799
782
  continue
800
-
801
783
  # Auto-include object dtypes
802
784
  if is_object_dtype(df[col]):
803
785
  features_to_plot.append(col)
804
- # Auto-include low-cardinality numeric features - REMOVED
805
- # elif is_numeric_dtype(df[col]) and df[col].nunique() <= max_categories:
806
- # _LOGGER.info(f"Treating low-cardinality numeric column '{col}' as categorical.")
807
- # features_to_plot.append(col)
786
+
808
787
  else:
809
788
  # Validate user-provided list
810
789
  for col in features:
@@ -822,12 +801,11 @@ def plot_categorical_vs_target(
822
801
 
823
802
  for target_name in valid_targets:
824
803
  # Create a sanitized subdirectory for this target
825
- safe_target_dir_name = sanitize_filename(f"{target_name}_vs_Categorical_{plot_type}")
804
+ safe_target_dir_name = sanitize_filename(f"{target_name}_vs_Categorical")
826
805
  target_save_dir = base_save_path / safe_target_dir_name
827
806
  target_save_dir.mkdir(parents=True, exist_ok=True)
828
807
 
829
- _LOGGER.info(f"Generating '{plot_type}' plots for target: '{target_name}' -> Saving to '{target_save_dir.name}'")
830
-
808
+ _LOGGER.info(f"Generating plots for target: '{target_name}' -> Saving to '{target_save_dir.name}'")
831
809
  for feature_name in features_to_plot:
832
810
 
833
811
  # Make a temporary copy for plotting to handle NaNs and dtypes
@@ -849,12 +827,9 @@ def plot_categorical_vs_target(
849
827
 
850
828
  # 5. Create the plot
851
829
  # Increase figure width for categories
852
- plt.figure(figsize=(max(10, n_unique * 1.2), 7))
830
+ plt.figure(figsize=(max(10, n_unique * 1.2), 10))
853
831
 
854
- if plot_type == "box":
855
- sns.boxplot(x=feature_name, y=target_name, data=temp_df)
856
- elif plot_type == "violin":
857
- sns.violinplot(x=feature_name, y=target_name, data=temp_df)
832
+ sns.boxplot(x=feature_name, y=target_name, data=temp_df)
858
833
 
859
834
  plt.title(f'{target_name} vs {feature_name}')
860
835
  plt.xlabel(feature_name)
@@ -982,7 +957,7 @@ def encode_categorical_features(
982
957
 
983
958
  # Handle the dataset splitting logic
984
959
  if split_resulting_dataset:
985
- df_categorical = df_encoded[valid_columns].to_frame() # type: ignore
960
+ df_categorical = df_encoded[valid_columns]
986
961
  df_non_categorical = df.drop(columns=valid_columns)
987
962
  return mappings, df_non_categorical, df_categorical
988
963
  else:
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "dragon-ml-toolbox"
3
- version = "19.12.2"
3
+ version = "19.13.0"
4
4
  description = "Complete pipelines and helper tools for data science and machine learning projects."
5
5
  authors = [
6
6
  { name = "Karl Luigi Loza Vidaurre", email = "luigiloza@gmail.com" }