dragon-ml-toolbox 19.12.2__tar.gz → 19.14.0__tar.gz

This diff compares the contents of two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (116):
  1. {dragon_ml_toolbox-19.12.2/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-19.14.0}/PKG-INFO +1 -1
  2. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0/dragon_ml_toolbox.egg-info}/PKG-INFO +1 -1
  3. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_data_exploration.py +152 -169
  4. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/pyproject.toml +1 -1
  5. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/LICENSE +0 -0
  6. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/LICENSE-THIRD-PARTY.md +0 -0
  7. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/README.md +0 -0
  8. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/dragon_ml_toolbox.egg-info/SOURCES.txt +0 -0
  9. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/dragon_ml_toolbox.egg-info/dependency_links.txt +0 -0
  10. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/dragon_ml_toolbox.egg-info/requires.txt +0 -0
  11. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/dragon_ml_toolbox.egg-info/top_level.txt +0 -0
  12. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/ETL_cleaning.py +0 -0
  13. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/ETL_engineering.py +0 -0
  14. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/GUI_tools.py +0 -0
  15. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/IO_tools.py +0 -0
  16. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/MICE_imputation.py +0 -0
  17. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/ML_callbacks.py +0 -0
  18. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/ML_chaining_inference.py +0 -0
  19. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/ML_chaining_utilities.py +0 -0
  20. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/ML_configuration.py +0 -0
  21. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/ML_configuration_pytab.py +0 -0
  22. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/ML_datasetmaster.py +0 -0
  23. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/ML_evaluation.py +0 -0
  24. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/ML_evaluation_captum.py +0 -0
  25. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/ML_evaluation_multi.py +0 -0
  26. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/ML_finalize_handler.py +0 -0
  27. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/ML_inference.py +0 -0
  28. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/ML_models.py +0 -0
  29. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/ML_models_advanced.py +0 -0
  30. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/ML_models_pytab.py +0 -0
  31. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/ML_optimization.py +0 -0
  32. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/ML_optimization_pareto.py +0 -0
  33. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/ML_scaler.py +0 -0
  34. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/ML_sequence_datasetmaster.py +0 -0
  35. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/ML_sequence_evaluation.py +0 -0
  36. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/ML_sequence_inference.py +0 -0
  37. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/ML_sequence_models.py +0 -0
  38. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/ML_trainer.py +0 -0
  39. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/ML_utilities.py +0 -0
  40. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/ML_vision_datasetmaster.py +0 -0
  41. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/ML_vision_evaluation.py +0 -0
  42. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/ML_vision_inference.py +0 -0
  43. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/ML_vision_models.py +0 -0
  44. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/ML_vision_transformers.py +0 -0
  45. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/PSO_optimization.py +0 -0
  46. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/SQL.py +0 -0
  47. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/VIF_factor.py +0 -0
  48. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/__init__.py +0 -0
  49. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ETL_cleaning.py +0 -0
  50. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ETL_engineering.py +0 -0
  51. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_GUI_tools.py +0 -0
  52. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_IO_tools.py +0 -0
  53. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_MICE_imputation.py +0 -0
  54. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ML_callbacks.py +0 -0
  55. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ML_chaining_inference.py +0 -0
  56. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ML_chaining_utilities.py +0 -0
  57. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ML_configuration.py +0 -0
  58. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ML_configuration_pytab.py +0 -0
  59. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ML_datasetmaster.py +0 -0
  60. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ML_evaluation.py +0 -0
  61. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ML_evaluation_captum.py +0 -0
  62. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ML_evaluation_multi.py +0 -0
  63. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ML_finalize_handler.py +0 -0
  64. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ML_inference.py +0 -0
  65. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ML_models.py +0 -0
  66. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ML_models_advanced.py +0 -0
  67. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ML_models_pytab.py +0 -0
  68. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ML_optimization.py +0 -0
  69. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ML_optimization_pareto.py +0 -0
  70. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ML_scaler.py +0 -0
  71. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ML_sequence_datasetmaster.py +0 -0
  72. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ML_sequence_evaluation.py +0 -0
  73. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ML_sequence_inference.py +0 -0
  74. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ML_sequence_models.py +0 -0
  75. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ML_trainer.py +0 -0
  76. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ML_utilities.py +0 -0
  77. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ML_vision_datasetmaster.py +0 -0
  78. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ML_vision_evaluation.py +0 -0
  79. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ML_vision_inference.py +0 -0
  80. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ML_vision_models.py +0 -0
  81. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ML_vision_transformers.py +0 -0
  82. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_PSO_optimization.py +0 -0
  83. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_SQL.py +0 -0
  84. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_VIF_factor.py +0 -0
  85. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/__init__.py +0 -0
  86. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ensemble_evaluation.py +0 -0
  87. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ensemble_inference.py +0 -0
  88. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ensemble_learning.py +0 -0
  89. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_excel_handler.py +0 -0
  90. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_keys.py +0 -0
  91. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_logger.py +0 -0
  92. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_math_utilities.py +0 -0
  93. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_models_advanced_base.py +0 -0
  94. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_models_advanced_helpers.py +0 -0
  95. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_optimization_tools.py +0 -0
  96. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_path_manager.py +0 -0
  97. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_plot_fonts.py +0 -0
  98. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_schema.py +0 -0
  99. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_script_info.py +0 -0
  100. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_serde.py +0 -0
  101. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_utilities.py +0 -0
  102. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/constants.py +0 -0
  103. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/data_exploration.py +0 -0
  104. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/ensemble_evaluation.py +0 -0
  105. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/ensemble_inference.py +0 -0
  106. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/ensemble_learning.py +0 -0
  107. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/excel_handler.py +0 -0
  108. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/keys.py +0 -0
  109. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/math_utilities.py +0 -0
  110. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/optimization_tools.py +0 -0
  111. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/path_manager.py +0 -0
  112. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/plot_fonts.py +0 -0
  113. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/schema.py +0 -0
  114. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/serde.py +0 -0
  115. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/ml_tools/utilities.py +0 -0
  116. {dragon_ml_toolbox-19.12.2 → dragon_ml_toolbox-19.14.0}/setup.cfg +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 19.12.2
+Version: 19.14.0
 Summary: Complete pipelines and helper tools for data science and machine learning projects.
 Author-email: Karl Luigi Loza Vidaurre <luigiloza@gmail.com>
 License-Expression: MIT
dragon_ml_toolbox.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 19.12.2
+Version: 19.14.0
 Summary: Complete pipelines and helper tools for data science and machine learning projects.
 Author-email: Karl Luigi Loza Vidaurre <luigiloza@gmail.com>
 License-Expression: MIT
ml_tools/_core/_data_exploration.py
@@ -26,21 +26,21 @@ __all__ = [
     "drop_macro",
     "clean_column_names",
     "plot_value_distributions",
-    "plot_continuous_vs_target",
-    "plot_categorical_vs_target",
-    "encode_categorical_features",
     "split_features_targets",
-    "split_continuous_binary",
+    "encode_categorical_features",
     "clip_outliers_single",
     "clip_outliers_multi",
     "drop_outlier_samples",
+    "plot_continuous_vs_target",
+    "plot_categorical_vs_target",
     "plot_correlation_heatmap",
+    "finalize_feature_schema",
     "match_and_filter_columns_by_regex",
     "standardize_percentages",
     "reconstruct_one_hot",
     "reconstruct_binary",
     "reconstruct_multibinary",
-    "finalize_feature_schema",
+    "split_continuous_binary",
     "apply_feature_schema"
 ]
 
@@ -59,16 +59,18 @@ def summarize_dataframe(df: pd.DataFrame, round_digits: int = 2):
     """
     summary = pd.DataFrame({
         'Data Type': df.dtypes,
-        'Non-Null Count': df.notnull().sum(),
+        'Completeness %': (df.notnull().mean() * 100).round(2),
         'Unique Values': df.nunique(),
-        'Missing %': (df.isnull().mean() * 100).round(round_digits)
+        # 'Missing %': (df.isnull().mean() * 100).round(2)
     })
 
     # For numeric columns, add summary statistics
     numeric_cols = df.select_dtypes(include='number').columns
     if not numeric_cols.empty:
-        summary_numeric = df[numeric_cols].describe().T[
-            ['mean', 'std', 'min', '25%', '50%', '75%', 'max']
+        stats = df[numeric_cols].describe(percentiles=[.10, .25, .50, .70, .80, .90])
+
+        summary_numeric = stats.T[
+            ['mean', 'std', 'min', '10%', '25%', '50%', '70%', '80%', '90%', 'max']
         ].round(round_digits)
         summary = summary.join(summary_numeric, how='left')
 
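The summary table now reports a 'Completeness %' column in place of the old non-null count / missing % pair, and widens the percentile spread from quartiles to 10/25/50/70/80/90. A minimal standalone sketch of the same logic (toy data, not the library call itself):

    import pandas as pd

    df = pd.DataFrame({
        "age": [22, 35, 58, None, 41],
        "city": ["LA", "NY", "NY", "LA", None],
    })

    summary = pd.DataFrame({
        "Data Type": df.dtypes,
        "Completeness %": (df.notnull().mean() * 100).round(2),
        "Unique Values": df.nunique(),
    })
    # describe() always includes min/max; the listed percentiles add the 10%..90% columns
    stats = df.select_dtypes(include="number").describe(percentiles=[.10, .25, .50, .70, .80, .90])
    print(summary.join(stats.T, how="left"))  # the library version also drops 'count' and rounds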
@@ -108,22 +110,17 @@ def drop_constant_columns(df: pd.DataFrame, verbose: bool = True) -> pd.DataFram
     for col_name in df_clean.columns:
         column = df_clean[col_name]
 
-        # We can apply this logic to all columns or only focus on numeric ones.
-        # if not is_numeric_dtype(column):
-        #     cols_to_keep.append(col_name)
-        #     continue
-
         # Keep a column if it has more than one unique value (nunique ignores NaNs by default)
         if column.nunique(dropna=True) > 1:
             cols_to_keep.append(col_name)
 
     dropped_columns = original_columns - set(cols_to_keep)
     if verbose:
-        _LOGGER.info(f"🧹 Dropped {len(dropped_columns)} constant columns.")
         if dropped_columns:
-            for dropped_column in dropped_columns:
-                print(f"  {dropped_column}")
-
+            _LOGGER.info(f"🧹 Dropped {len(dropped_columns)} constant columns: {list(dropped_columns)}")
+        else:
+            _LOGGER.info("No constant columns found.")
+
     # Return a new DataFrame with only the columns to keep
     df_clean = df_clean[cols_to_keep]
 
@@ -338,8 +335,7 @@ def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7, sho
     cols_to_drop = missing_fraction[missing_fraction > threshold].index # type: ignore
 
     if len(cols_to_drop) > 0:
-        _LOGGER.info(f"🧹 Dropping columns with more than {threshold*100:.0f}% missing data:")
-        print(list(cols_to_drop))
+        _LOGGER.info(f"🧹 Dropping columns with more than {threshold*100:.0f}% missing data: {list(cols_to_drop)}")
 
     result_df = df.drop(columns=cols_to_drop)
     if show_nulls_after:
@@ -369,9 +365,8 @@ def drop_macro(df: pd.DataFrame,
 
     Args:
         df (pd.DataFrame): The input pandas DataFrame to be cleaned.
-        log_directory (Union[str, Path]): Path to the directory where the
-            'Missing_Data_start.csv' and 'Missing_Data_final.csv' logs
-            will be saved.
+        log_directory (Union[str, Path]): Path to the directory where the missing data reports
+            and plots will be saved inside a "Missing Report" subdirectory.
         targets (list[str]): A list of column names to be treated as target
             variables. This list guides the row-dropping logic.
         skip_targets (bool, optional): If True, the columns listed in `targets`
@@ -387,15 +382,18 @@ def drop_macro(df: pd.DataFrame,
     # make a deep copy to work with
     df_clean = df.copy()
 
+    base_dir_path = make_fullpath(log_directory, make=True, enforce="directory")
+    full_path = base_dir_path / "Missing Report"
+
     # Log initial state + Plot
     missing_data_start = show_null_columns(
         df=df_clean,
-        plot_to_dir=log_directory,
+        plot_to_dir=full_path,
        plot_filename="Original",
         use_all_columns=True
     )
     save_dataframe_filename(df=missing_data_start.reset_index(drop=False),
-                            save_dir=log_directory,
+                            save_dir=full_path,
                             filename="Missing_Data_Original")
 
     # Clean cycles for rows and columns
@@ -424,12 +422,12 @@ def drop_macro(df: pd.DataFrame,
     # log final state + plot
     missing_data_final = show_null_columns(
         df=df_clean,
-        plot_to_dir=log_directory,
+        plot_to_dir=full_path,
         plot_filename="Processed",
         use_all_columns=True
     )
     save_dataframe_filename(df=missing_data_final.reset_index(drop=False),
-                            save_dir=log_directory,
+                            save_dir=full_path,
                             filename="Missing_Data_Processed")
 
     # return cleaned dataframe
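drop_macro now nests all of its missing-data artifacts inside a "Missing Report" subdirectory instead of writing them to log_directory directly. A hedged usage sketch (the import path assumes the public data_exploration module re-exports the function; the data file and target column are hypothetical):

    import pandas as pd
    from ml_tools.data_exploration import drop_macro  # assumed public re-export

    df = pd.read_csv("data.csv")  # hypothetical input
    df_clean = drop_macro(
        df=df,
        log_directory="./eda_logs",  # artifacts land in "./eda_logs/Missing Report"
        targets=["target_a"],        # hypothetical target column
    )
    # "./eda_logs/Missing Report" now holds the Missing_Data_Original and
    # Missing_Data_Processed tables plus the "Original" and "Processed" plots.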
@@ -476,9 +474,8 @@ def plot_value_distributions(
     df: pd.DataFrame,
     save_dir: Union[str, Path],
     categorical_columns: Optional[List[str]] = None,
-    categorical_cardinality_threshold: int = 10,
-    max_categories: int = 50,
-    fill_na_with: str = "Missing"
+    max_categories: int = 100,
+    fill_na_with: str = "MISSING DATA"
 ):
     """
     Plots and saves the value distributions for all columns in a DataFrame,
@@ -491,15 +488,9 @@ def plot_value_distributions(
     Args:
         df (pd.DataFrame): The input DataFrame to analyze.
         save_dir (str | Path): Directory path to save the plots.
-        categorical_columns (List[str] | None): If provided, this list
-            of column names will be treated as categorical, and all other columns will be treated as continuous. This
-            overrides the `continuous_cardinality_threshold` logic.
-        categorical_cardinality_threshold (int): A numeric column will be treated
-            as 'categorical' if its number of unique values is less than or equal to this threshold. (Ignored if `categorical_columns` is set).
-        max_categories (int): The maximum number of unique categories a
-            categorical feature can have to be plotted. Features exceeding this limit will be skipped.
-        fill_na_with (str): A string to replace NaN values in categorical columns. This allows plotting 'missingness' as its
-            own category. Defaults to "Missing".
+        categorical_columns (List[str] | None): If provided, these will be treated as categorical, and all other columns will be treated as continuous.
+        max_categories (int): The maximum number of unique categories a categorical feature can have to be plotted. Features exceeding this limit will be skipped.
+        fill_na_with (str): A string to replace NaN values in categorical columns. This allows plotting 'missingness' as its own category.
 
     Notes:
         - `seaborn.histplot` with KDE is used for continuous features.
@@ -534,7 +525,7 @@ def plot_value_distributions(
             is_continuous = True
         else:
             # Use auto-detection
-            if is_numeric and n_unique > categorical_cardinality_threshold:
+            if is_numeric:
                 is_continuous = True
 
         # --- Case 1: Continuous Numeric (Histogram) ---
@@ -549,7 +540,7 @@ def plot_value_distributions(
             save_path = numeric_dir / f"{sanitize_filename(col_name)}.svg"
             numeric_plots_saved += 1
 
-        # --- Case 2: Categorical or Low-Cardinality Numeric (Count Plot) ---
+        # --- Case 2: Categorical (Count Plot) ---
         else:
             # Check max categories
             if n_unique > max_categories:
@@ -558,7 +549,7 @@ def plot_value_distributions(
 
             # Adaptive figure size
             fig_width = max(10, n_unique * 0.5)
-            plt.figure(figsize=(fig_width, 7))
+            plt.figure(figsize=(fig_width, 8))
 
             # Make a temporary copy for plotting to handle NaNs
             temp_series = df[col_name].copy()
@@ -573,7 +564,7 @@ def plot_value_distributions(
 
             # Get category order by frequency
             order = temp_series.value_counts().index
-            sns.countplot(x=temp_series, order=order, palette="viridis")
+            sns.countplot(x=temp_series, order=order, palette="Oranges", hue=temp_series, legend=False)
 
             plt.title(f"Distribution of '{col_name}' (Categorical)")
             plt.xlabel(col_name)
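With categorical_cardinality_threshold removed, every numeric column is now plotted as continuous unless it is explicitly named in categorical_columns. A usage sketch under the same re-export assumption, with hypothetical column names:

    import pandas as pd
    from ml_tools.data_exploration import plot_value_distributions  # assumed re-export

    df = pd.read_csv("data.csv")  # hypothetical input
    plot_value_distributions(
        df=df,
        save_dir="./plots/distributions",
        categorical_columns=["city", "grade"],  # everything else is treated as continuous
        max_categories=100,                     # new default (was 50)
        fill_na_with="MISSING DATA",            # new default label for NaN categories
    )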
@@ -607,68 +598,55 @@ def plot_value_distributions(
 
 
 def plot_continuous_vs_target(
-    df: pd.DataFrame,
-    targets: List[str],
+    df_continuous: pd.DataFrame,
+    df_targets: pd.DataFrame,
     save_dir: Union[str, Path],
-    features: Optional[List[str]] = None
+    verbose: int = 1
 ):
     """
-    Plots each continuous feature against each target to visualize linear relationships.
+    Plots each continuous feature from df_continuous against each target in df_targets.
 
-    This function is a common EDA step for regression tasks. It creates a
-    scatter plot for each feature-target pair, overlays a simple linear
-    regression line, and saves each plot as an individual .svg file.
+    This function creates a scatter plot for each feature-target pair, overlays a
+    simple linear regression line, and saves each plot as an individual .svg file.
 
     Plots are saved in a structured way, with a subdirectory created for
     each target variable.
 
     Args:
-        df (pd.DataFrame): The input DataFrame.
-        targets (List[str]): A list of target column names to plot (y-axis).
-        save_dir (str | Path): The base directory where plots will be saved. A subdirectory will be created here for each target.
-        features (List[str] | None): A list of feature column names to plot (x-axis). If None, all non-target columns in the
-            DataFrame will be used.
+        df_continuous (pd.DataFrame): DataFrame containing continuous feature columns (x-axis).
+        df_targets (pd.DataFrame): DataFrame containing target columns (y-axis).
+        save_dir (str | Path): The base directory where plots will be saved.
+        verbose (int): Verbosity level for logging warnings.
 
     Notes:
-        - Only numeric features and numeric targets are processed. Non-numeric
-          columns in the lists will be skipped with a warning.
-        - Rows with NaN in either the feature or the target are dropped
-          pairwise for each plot.
+        - Only numeric features and numeric targets are processed.
+        - Rows with NaN in either the feature or the target are dropped pairwise.
+        - Assumes df_continuous and df_targets share the same index.
     """
     # 1. Validate the base save directory
     base_save_path = make_fullpath(save_dir, make=True, enforce="directory")
 
-    # 2. Validate helper
-    def _validate_numeric_cols(col_list: List[str], col_type: str) -> List[str]:
+    # 2. Validation helper
+    def _get_valid_numeric_cols(df: pd.DataFrame, df_name: str) -> List[str]:
         valid_cols = []
-        for col in col_list:
-            if col not in df.columns:
-                _LOGGER.warning(f"{col_type} column '{col}' not found. Skipping.")
-            elif not is_numeric_dtype(df[col]):
-                _LOGGER.warning(f"{col_type} column '{col}' is not numeric. Skipping.")
+        for col in df.columns:
+            if not is_numeric_dtype(df[col]):
+                if verbose > 0:
+                    _LOGGER.warning(f"Column '{col}' in {df_name} is not numeric. Skipping.")
             else:
                 valid_cols.append(col)
         return valid_cols
 
-    # 3. Validate target columns FIRST
-    valid_targets = _validate_numeric_cols(targets, "Target")
+    # 3. Validate target columns
+    valid_targets = _get_valid_numeric_cols(df_targets, "df_targets")
     if not valid_targets:
-        _LOGGER.error("No valid numeric target columns provided to plot.")
+        _LOGGER.error("No valid numeric target columns provided in df_targets.")
         return
 
-    # 4. Determine and validate feature columns
-    if features is None:
-        _LOGGER.info("No 'features' list provided. Using all non-target columns as features.")
-        target_set = set(valid_targets)
-        # Get all columns that are not in the valid_targets set
-        features_to_validate = [col for col in df.columns if col not in target_set]
-    else:
-        features_to_validate = features
-
-    valid_features = _validate_numeric_cols(features_to_validate, "Feature")
-
+    # 4. Validate feature columns
+    valid_features = _get_valid_numeric_cols(df_continuous, "df_continuous")
     if not valid_features:
-        _LOGGER.error("No valid numeric feature columns found to plot.")
+        _LOGGER.error("No valid numeric feature columns provided in df_continuous.")
         return
 
     # 5. Main plotting loop
@@ -680,15 +658,20 @@ def plot_continuous_vs_target(
         target_save_dir = base_save_path / safe_target_dir_name
         target_save_dir.mkdir(parents=True, exist_ok=True)
 
-        _LOGGER.info(f"Generating plots for target: '{target_name}' -> Saving to '{target_save_dir.name}'")
+        if verbose > 0:
+            _LOGGER.info(f"Generating plots for target: '{target_name}' -> Saving to '{target_save_dir.name}'")
 
         for feature_name in valid_features:
 
-            # Drop NaNs pairwise for this specific plot
-            temp_df = df[[feature_name, target_name]].dropna()
+            # Align data and drop NaNs pairwise - use concat to ensure we respect the index alignment between the two DFs
+            temp_df = pd.concat([
+                df_continuous[feature_name],
+                df_targets[target_name]
+            ], axis=1).dropna()
 
             if temp_df.empty:
-                _LOGGER.warning(f"No non-null data for '{feature_name}' vs '{target_name}'. Skipping plot.")
+                if verbose > 1:
+                    _LOGGER.warning(f"No non-null data for '{feature_name}' vs '{target_name}'. Skipping plot.")
                 continue
 
             x = temp_df[feature_name]
@@ -696,11 +679,12 @@ def plot_continuous_vs_target(
 
             # 6. Perform linear fit
             try:
-                # Modern replacement for np.polyfit + np.poly1d. Compatible with NumPy 1.14+ and NumPy 2.0+
+                # Modern replacement for np.polyfit + np.poly1d
                 p = np.polynomial.Polynomial.fit(x, y, deg=1)
                 plot_regression_line = True
             except (np.linalg.LinAlgError, ValueError):
-                _LOGGER.warning(f"Linear regression failed for '{feature_name}' vs '{target_name}'. Plotting scatter only.")
+                if verbose > 0:
+                    _LOGGER.warning(f"Linear regression failed for '{feature_name}' vs '{target_name}'. Plotting scatter only.")
                 plot_regression_line = False
 
             # 7. Create the plot
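The reworked signature takes two index-aligned DataFrames instead of one DataFrame plus targets/features lists. A usage sketch (same re-export assumption; data is hypothetical):

    import pandas as pd
    from ml_tools.data_exploration import plot_continuous_vs_target  # assumed re-export

    df = pd.DataFrame({
        "temp": [1.0, 2.0, 3.0, 4.0],
        "pressure": [0.5, 0.7, None, 0.9],
        "yield": [10.0, 14.0, 17.0, 19.0],
    })  # hypothetical data sharing one index

    plot_continuous_vs_target(
        df_continuous=df[["temp", "pressure"]],  # numeric features (x-axis)
        df_targets=df[["yield"]],                # numeric targets (y-axis)
        save_dir="./plots/continuous_vs_target",
        verbose=1,  # 0 silences info/warnings; >1 also logs skipped all-NaN pairs
    )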
@@ -734,87 +718,68 @@ def plot_continuous_vs_target(
 
             # Close the figure to free up memory
             plt.close()
-
-    _LOGGER.info(f"Successfully saved {total_plots_saved} feature-vs-target plots to '{base_save_path}'.")
+
+    if verbose > 0:
+        _LOGGER.info(f"Successfully saved {total_plots_saved} feature-vs-target plots to '{base_save_path}'.")
 
 
 def plot_categorical_vs_target(
-    df: pd.DataFrame,
-    targets: List[str],
+    df_categorical: pd.DataFrame,
+    df_targets: pd.DataFrame,
     save_dir: Union[str, Path],
-    features: Optional[List[str]] = None,
-    plot_type: Literal["box", "violin"] = "box",
-    max_categories: int = 20,
-    fill_na_with: str = "Missing"
+    max_categories: int = 50,
+    fill_na_with: str = "MISSING DATA",
+    drop_empty_targets: bool = True,
+    verbose: int = 1
 ):
     """
-    Plots each categorical feature against each numeric target using box or violin plots.
+    Plots each feature in df_categorical against each numeric target in df_targets using box plots.
 
-    This function is a core EDA step for regression tasks to understand the
-    relationship between a categorical independent variable and a continuous
-    dependent variable.
-
-    Plots are saved as individual .svg files in a structured way, with a subdirectory created for each target.
+    Automatically aligns the two DataFrames by index. If a numeric
+    column is passed within df_categorical, it will be cast to object type to treat it as a category.
 
     Args:
-        df (pd.DataFrame): The input DataFrame.
-        targets (List[str]): A list of numeric target column names (y-axis).
-        save_dir (str | Path): The base directory where plots will be saved. A subdirectory will be created here for each target.
-        features (List[str] | None): A list of categorical feature column names (x-axis). If None, all non-numeric (object) columns will be used.
-        plot_type (Literal["box", "violin"]): The type of plot to generate.
-        max_categories (int): The maximum number of unique categories a feature can have to be plotted. Features exceeding this limit will be skipped.
-        fill_na_with (str): A string to replace NaN values in categorical columns. This allows plotting 'missingness' as its own category. Defaults to "Missing".
+        df_categorical (pd.DataFrame): DataFrame containing categorical feature columns (x-axis).
+        df_targets (pd.DataFrame): DataFrame containing numeric target columns (y-axis).
+        save_dir (str | Path): Base directory for saving plots.
+        max_categories (int): The maximum number of unique categories a feature can have to be plotted.
+        fill_na_with (str): String to replace NaN values in categorical columns.
+        drop_empty_targets (bool): If True, drops rows where the target value is NaN before plotting.
+        verbose (int): Verbosity level for logging warnings.
 
     Notes:
-        - Only numeric targets are processed.
-        - Features are automatically identified as categorical if they are 'object' dtype.
+        - Assumes df_categorical and df_targets share the same index.
     """
-    # 1. Validate the base save directory and inputs
+    # 1. Validate the base save directory
     base_save_path = make_fullpath(save_dir, make=True, enforce="directory")
-
-    if plot_type not in ["box", "violin"]:
-        _LOGGER.error(f"Invalid plot type '{plot_type}'")
-        raise ValueError()
 
     # 2. Validate target columns (must be numeric)
     valid_targets = []
-    for col in targets:
-        if col not in df.columns:
-            _LOGGER.warning(f"Target column '{col}' not found. Skipping.")
-        elif not is_numeric_dtype(df[col]):
-            _LOGGER.warning(f"Target column '{col}' is not numeric. Skipping.")
+    for col in df_targets.columns:
+        if not is_numeric_dtype(df_targets[col]):
+            if verbose > 0:
+                _LOGGER.warning(f"Target column '{col}' in df_targets is not numeric. Skipping.")
         else:
            valid_targets.append(col)
 
     if not valid_targets:
-        _LOGGER.error("No valid numeric target columns provided to plot.")
+        _LOGGER.error("No valid numeric target columns provided in df_targets.")
         return
 
-    # 3. Determine and validate feature columns
-    features_to_plot = []
-    if features is None:
-        _LOGGER.info("No 'features' list provided. Auto-detecting categorical features.")
-        for col in df.columns:
-            if col in valid_targets:
-                continue
-
-            # Auto-include object dtypes
-            if is_object_dtype(df[col]):
-                features_to_plot.append(col)
-            # Auto-include low-cardinality numeric features - REMOVED
-            # elif is_numeric_dtype(df[col]) and df[col].nunique() <= max_categories:
-            #     _LOGGER.info(f"Treating low-cardinality numeric column '{col}' as categorical.")
-            #     features_to_plot.append(col)
-    else:
-        # Validate user-provided list
-        for col in features:
-            if col not in df.columns:
-                _LOGGER.warning(f"Feature column '{col}' not found in DataFrame. Skipping.")
-            else:
-                features_to_plot.append(col)
+    # 3. Validate feature columns (Flexible: Allow numeric but warn)
+    valid_features = []
+    for col in df_categorical.columns:
+        # If numeric, warn but accept it (will be cast to object later)
+        if is_numeric_dtype(df_categorical[col]):
+            if verbose > 0:
+                _LOGGER.warning(f"Feature '{col}' in df_categorical is numeric. It will be cast to 'object' and treated as categorical.")
+            valid_features.append(col)
+        else:
+            # Assume it is already object/category
+            valid_features.append(col)
 
-    if not features_to_plot:
-        _LOGGER.error("No valid categorical feature columns found to plot.")
+    if not valid_features:
+        _LOGGER.error("No valid feature columns provided in df_categorical.")
         return
 
     # 4. Main plotting loop
@@ -822,39 +787,53 @@ def plot_categorical_vs_target(
 
     for target_name in valid_targets:
         # Create a sanitized subdirectory for this target
-        safe_target_dir_name = sanitize_filename(f"{target_name}_vs_Categorical_{plot_type}")
+        safe_target_dir_name = sanitize_filename(f"{target_name}_vs_Categorical")
         target_save_dir = base_save_path / safe_target_dir_name
         target_save_dir.mkdir(parents=True, exist_ok=True)
 
-        _LOGGER.info(f"Generating '{plot_type}' plots for target: '{target_name}' -> Saving to '{target_save_dir.name}'")
+        if verbose > 0:
+            _LOGGER.info(f"Generating plots for target: '{target_name}' -> Saving to '{target_save_dir.name}'")
+
+        for feature_name in valid_features:
+
+            # Align data using concat to respect indices
+            feature_series = df_categorical[feature_name]
+            target_series = df_targets[target_name]
+
+            # Create a temporary DataFrame for this pair
+            temp_df = pd.concat([feature_series, target_series], axis=1)
+
+            # Optional: Drop rows where the target is NaN
+            if drop_empty_targets:
+                temp_df = temp_df.dropna(subset=[target_name])
+                if temp_df.empty:
+                    if verbose > 1:
+                        _LOGGER.warning(f"No valid data left for '{feature_name}' vs '{target_name}' after dropping empty targets. Skipping.")
+                    continue
 
-        for feature_name in features_to_plot:
+            # Force feature to object if it isn't already (handling the numeric flexibility)
+            if not is_object_dtype(temp_df[feature_name]):
+                temp_df[feature_name] = temp_df[feature_name].astype(object)
+
+            # Handle NaNs in the feature column (treat as a category)
+            if temp_df[feature_name].isnull().any():
+                temp_df[feature_name] = temp_df[feature_name].fillna(fill_na_with)
 
-            # Make a temporary copy for plotting to handle NaNs and dtypes
-            temp_df = df[[feature_name, target_name]].copy()
+            # Convert to string to ensure consistent plotting and cardinality check
+            temp_df[feature_name] = temp_df[feature_name].astype(str)
 
             # Check cardinality
             n_unique = temp_df[feature_name].nunique()
             if n_unique > max_categories:
-                _LOGGER.warning(f"Skipping '{feature_name}': {n_unique} unique values > {max_categories} max_categories.")
+                if verbose > 1:
+                    _LOGGER.warning(f"Skipping '{feature_name}': {n_unique} unique categories > {max_categories} max_categories.")
                 continue
-
-            # Handle NaNs by replacing them with the specified string
-            if temp_df[feature_name].isnull().any():
-                # Convert to object type first to allow string replacement
-                temp_df[feature_name] = temp_df[feature_name].astype(object).fillna(fill_na_with)
-
-            # Convert feature to string to ensure correct plotting order
-            temp_df[feature_name] = temp_df[feature_name].astype(str)
 
             # 5. Create the plot
-            # Increase figure width for categories
-            plt.figure(figsize=(max(10, n_unique * 1.2), 7))
+            # Dynamic figure width based on number of categories
+            plt.figure(figsize=(max(10, n_unique * 0.8), 10))
 
-            if plot_type == "box":
-                sns.boxplot(x=feature_name, y=target_name, data=temp_df)
-            elif plot_type == "violin":
-                sns.violinplot(x=feature_name, y=target_name, data=temp_df)
+            sns.boxplot(x=feature_name, y=target_name, data=temp_df)
 
             plt.title(f'{target_name} vs {feature_name}')
             plt.xlabel(feature_name)
@@ -875,8 +854,9 @@ def plot_categorical_vs_target(
                 _LOGGER.error(f"Failed to save plot: {plot_path}. Error: {e}")
             plt.close()
 
-
-    _LOGGER.info(f"Successfully saved {total_plots_saved} categorical-vs-target plots to '{base_save_path}'.")
+
+    if verbose > 0:
+        _LOGGER.info(f"Successfully saved {total_plots_saved} categorical-vs-target plots to '{base_save_path}'.")
 
 
 def encode_categorical_features(
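plot_categorical_vs_target follows the same split-DataFrame pattern and is now box-plot only; plot_type and the violin option are gone. A usage sketch (same re-export assumption; data is hypothetical):

    import pandas as pd
    from ml_tools.data_exploration import plot_categorical_vs_target  # assumed re-export

    df = pd.DataFrame({
        "city": ["LA", "NY", "NY", None],
        "yield": [10.0, 14.0, None, 19.0],
    })  # hypothetical data

    plot_categorical_vs_target(
        df_categorical=df[["city"]],  # numeric columns would be cast to object with a warning
        df_targets=df[["yield"]],     # must be numeric; shares the index
        save_dir="./plots/categorical_vs_target",
        max_categories=50,            # new default (was 20)
        drop_empty_targets=True,      # new: rows whose target is NaN are dropped first
    )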
@@ -982,7 +962,7 @@ def encode_categorical_features(
 
     # Handle the dataset splitting logic
     if split_resulting_dataset:
-        df_categorical = df_encoded[valid_columns].to_frame() # type: ignore
+        df_categorical = df_encoded[valid_columns]
         df_non_categorical = df.drop(columns=valid_columns)
         return mappings, df_non_categorical, df_categorical
     else:
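Dropping .to_frame() fixes a latent bug: indexing a DataFrame with a list of column names already returns a DataFrame, and .to_frame() exists only on Series, so the old line raised AttributeError. A minimal repro with toy data:

    import pandas as pd

    df_encoded = pd.DataFrame({"color": [0, 1], "size": [2, 0], "price": [9.5, 3.2]})
    valid_columns = ["color", "size"]

    df_categorical = df_encoded[valid_columns]  # list indexing already yields a DataFrame
    # df_encoded[valid_columns].to_frame()      # old line: AttributeError, DataFrame has no to_frame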
@@ -1103,7 +1083,10 @@ def plot_correlation_heatmap(df: pd.DataFrame,
         annot=annot_bool,
         cmap='coolwarm',
         fmt=".2f",
-        cbar_kws={"shrink": 0.8}
+        cbar_kws={"shrink": 0.8},
+        vmin=-1,  # Anchors minimum color to -1
+        vmax=1,   # Anchors maximum color to 1
+        center=0  # Ensures 0 corresponds to the neutral color (white)
     )
 
     # add suffix to title
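Anchoring the color scale with vmin/vmax/center means a given hue always maps to the same correlation value, so heatmaps from different datasets stay visually comparable. A standalone seaborn sketch of the effect:

    import numpy as np
    import pandas as pd
    import seaborn as sns
    import matplotlib.pyplot as plt

    # Correlation matrix of random data; values hover near 0
    corr = pd.DataFrame(
        np.random.default_rng(0).normal(size=(50, 4)), columns=list("abcd")
    ).corr()
    sns.heatmap(
        corr, annot=True, cmap="coolwarm", fmt=".2f", cbar_kws={"shrink": 0.8},
        vmin=-1, vmax=1, center=0,  # fixed scale: identical colors mean identical correlations
    )
    plt.show()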
pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "dragon-ml-toolbox"
-version = "19.12.2"
+version = "19.14.0"
 description = "Complete pipelines and helper tools for data science and machine learning projects."
 authors = [
     { name = "Karl Luigi Loza Vidaurre", email = "luigiloza@gmail.com" }