dragon-ml-toolbox 19.12.1__tar.gz → 19.13.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116) hide show
  1. {dragon_ml_toolbox-19.12.1/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-19.13.0}/PKG-INFO +1 -1
  2. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0/dragon_ml_toolbox.egg-info}/PKG-INFO +1 -1
  3. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_trainer.py +3 -3
  4. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_data_exploration.py +38 -62
  5. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_utilities.py +1 -1
  6. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/pyproject.toml +1 -1
  7. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/LICENSE +0 -0
  8. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/LICENSE-THIRD-PARTY.md +0 -0
  9. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/README.md +0 -0
  10. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/dragon_ml_toolbox.egg-info/SOURCES.txt +0 -0
  11. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/dragon_ml_toolbox.egg-info/dependency_links.txt +0 -0
  12. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/dragon_ml_toolbox.egg-info/requires.txt +0 -0
  13. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/dragon_ml_toolbox.egg-info/top_level.txt +0 -0
  14. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/ETL_cleaning.py +0 -0
  15. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/ETL_engineering.py +0 -0
  16. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/GUI_tools.py +0 -0
  17. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/IO_tools.py +0 -0
  18. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/MICE_imputation.py +0 -0
  19. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_callbacks.py +0 -0
  20. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_chaining_inference.py +0 -0
  21. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_chaining_utilities.py +0 -0
  22. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_configuration.py +0 -0
  23. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_configuration_pytab.py +0 -0
  24. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_datasetmaster.py +0 -0
  25. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_evaluation.py +0 -0
  26. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_evaluation_captum.py +0 -0
  27. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_evaluation_multi.py +0 -0
  28. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_finalize_handler.py +0 -0
  29. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_inference.py +0 -0
  30. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_models.py +0 -0
  31. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_models_advanced.py +0 -0
  32. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_models_pytab.py +0 -0
  33. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_optimization.py +0 -0
  34. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_optimization_pareto.py +0 -0
  35. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_scaler.py +0 -0
  36. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_sequence_datasetmaster.py +0 -0
  37. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_sequence_evaluation.py +0 -0
  38. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_sequence_inference.py +0 -0
  39. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_sequence_models.py +0 -0
  40. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_trainer.py +0 -0
  41. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_utilities.py +0 -0
  42. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_vision_datasetmaster.py +0 -0
  43. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_vision_evaluation.py +0 -0
  44. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_vision_inference.py +0 -0
  45. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_vision_models.py +0 -0
  46. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/ML_vision_transformers.py +0 -0
  47. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/PSO_optimization.py +0 -0
  48. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/SQL.py +0 -0
  49. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/VIF_factor.py +0 -0
  50. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/__init__.py +0 -0
  51. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ETL_cleaning.py +0 -0
  52. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ETL_engineering.py +0 -0
  53. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_GUI_tools.py +0 -0
  54. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_IO_tools.py +0 -0
  55. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_MICE_imputation.py +0 -0
  56. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_callbacks.py +0 -0
  57. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_chaining_inference.py +0 -0
  58. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_chaining_utilities.py +0 -0
  59. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_configuration.py +0 -0
  60. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_configuration_pytab.py +0 -0
  61. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_datasetmaster.py +0 -0
  62. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_evaluation.py +0 -0
  63. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_evaluation_captum.py +0 -0
  64. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_evaluation_multi.py +0 -0
  65. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_finalize_handler.py +0 -0
  66. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_inference.py +0 -0
  67. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_models.py +0 -0
  68. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_models_advanced.py +0 -0
  69. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_models_pytab.py +0 -0
  70. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_optimization.py +0 -0
  71. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_optimization_pareto.py +0 -0
  72. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_scaler.py +0 -0
  73. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_sequence_datasetmaster.py +0 -0
  74. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_sequence_evaluation.py +0 -0
  75. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_sequence_inference.py +0 -0
  76. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_sequence_models.py +0 -0
  77. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_utilities.py +0 -0
  78. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_vision_datasetmaster.py +0 -0
  79. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_vision_evaluation.py +0 -0
  80. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_vision_inference.py +0 -0
  81. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_vision_models.py +0 -0
  82. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ML_vision_transformers.py +0 -0
  83. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_PSO_optimization.py +0 -0
  84. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_SQL.py +0 -0
  85. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_VIF_factor.py +0 -0
  86. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/__init__.py +0 -0
  87. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ensemble_evaluation.py +0 -0
  88. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ensemble_inference.py +0 -0
  89. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_ensemble_learning.py +0 -0
  90. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_excel_handler.py +0 -0
  91. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_keys.py +0 -0
  92. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_logger.py +0 -0
  93. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_math_utilities.py +0 -0
  94. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_models_advanced_base.py +0 -0
  95. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_models_advanced_helpers.py +0 -0
  96. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_optimization_tools.py +0 -0
  97. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_path_manager.py +0 -0
  98. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_plot_fonts.py +0 -0
  99. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_schema.py +0 -0
  100. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_script_info.py +0 -0
  101. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/_core/_serde.py +0 -0
  102. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/constants.py +0 -0
  103. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/data_exploration.py +3 -3
  104. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/ensemble_evaluation.py +0 -0
  105. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/ensemble_inference.py +0 -0
  106. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/ensemble_learning.py +0 -0
  107. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/excel_handler.py +0 -0
  108. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/keys.py +0 -0
  109. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/math_utilities.py +0 -0
  110. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/optimization_tools.py +0 -0
  111. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/path_manager.py +0 -0
  112. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/plot_fonts.py +0 -0
  113. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/schema.py +0 -0
  114. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/serde.py +0 -0
  115. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/ml_tools/utilities.py +0 -0
  116. {dragon_ml_toolbox-19.12.1 → dragon_ml_toolbox-19.13.0}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dragon-ml-toolbox
3
- Version: 19.12.1
3
+ Version: 19.13.0
4
4
  Summary: Complete pipelines and helper tools for data science and machine learning projects.
5
5
  Author-email: Karl Luigi Loza Vidaurre <luigiloza@gmail.com>
6
6
  License-Expression: MIT
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dragon-ml-toolbox
3
- Version: 19.12.1
3
+ Version: 19.13.0
4
4
  Summary: Complete pipelines and helper tools for data science and machine learning projects.
5
5
  Author-email: Karl Luigi Loza Vidaurre <luigiloza@gmail.com>
6
6
  License-Expression: MIT
@@ -663,7 +663,7 @@ class DragonTrainer(_BaseDragonTrainer):
663
663
  Evaluates the model, routing to the correct evaluation function based on task `kind`.
664
664
 
665
665
  Args:
666
- model_checkpoint ('auto' | Path | None):
666
+ model_checkpoint (Path | "best" | "current"):
667
667
  - Path to a valid checkpoint for the model. The state of the trained model will be overwritten in place.
668
668
  - If 'best', the best checkpoint will be loaded if a DragonModelCheckpoint was provided. The state of the trained model will be overwritten in place.
669
669
  - If 'current', use the current state of the trained model up to the latest trained epoch.
@@ -1608,7 +1608,7 @@ class DragonDetectionTrainer(_BaseDragonTrainer):
1608
1608
 
1609
1609
  Args:
1610
1610
  save_dir (str | Path): Directory to save all reports and plots.
1611
- model_checkpoint ('auto' | Path | None):
1611
+ model_checkpoint (Path | "best" | "current"):
1612
1612
  - Path to a valid checkpoint for the model. The state of the trained model will be overwritten in place.
1613
1613
  - If 'best', the best checkpoint will be loaded if a DragonModelCheckpoint was provided. The state of the trained model will be overwritten in place.
1614
1614
  - If 'current', use the current state of the trained model up to the latest trained epoch.
@@ -2046,7 +2046,7 @@ class DragonSequenceTrainer(_BaseDragonTrainer):
2046
2046
  Evaluates the model, routing to the correct evaluation function.
2047
2047
 
2048
2048
  Args:
2049
- model_checkpoint ('auto' | Path | None):
2049
+ model_checkpoint (Path | "best" | "current"):
2050
2050
  - Path to a valid checkpoint for the model.
2051
2051
  - If 'best', the best checkpoint will be loaded.
2052
2052
  - If 'current', use the current state of the trained model.
@@ -17,7 +17,6 @@ from ._schema import FeatureSchema
17
17
  _LOGGER = get_logger("Data Exploration")
18
18
 
19
19
 
20
- # Keep track of all available tools, show using `info()`
21
20
  __all__ = [
22
21
  "summarize_dataframe",
23
22
  "drop_constant_columns",
@@ -29,19 +28,19 @@ __all__ = [
29
28
  "plot_value_distributions",
30
29
  "plot_continuous_vs_target",
31
30
  "plot_categorical_vs_target",
32
- "encode_categorical_features",
33
31
  "split_features_targets",
34
- "split_continuous_binary",
32
+ "encode_categorical_features",
35
33
  "clip_outliers_single",
36
34
  "clip_outliers_multi",
37
35
  "drop_outlier_samples",
38
36
  "plot_correlation_heatmap",
37
+ "finalize_feature_schema",
39
38
  "match_and_filter_columns_by_regex",
40
39
  "standardize_percentages",
41
40
  "reconstruct_one_hot",
42
41
  "reconstruct_binary",
43
42
  "reconstruct_multibinary",
44
- "finalize_feature_schema",
43
+ "split_continuous_binary",
45
44
  "apply_feature_schema"
46
45
  ]
47
46
 
@@ -109,22 +108,17 @@ def drop_constant_columns(df: pd.DataFrame, verbose: bool = True) -> pd.DataFram
109
108
  for col_name in df_clean.columns:
110
109
  column = df_clean[col_name]
111
110
 
112
- # We can apply this logic to all columns or only focus on numeric ones.
113
- # if not is_numeric_dtype(column):
114
- # cols_to_keep.append(col_name)
115
- # continue
116
-
117
111
  # Keep a column if it has more than one unique value (nunique ignores NaNs by default)
118
112
  if column.nunique(dropna=True) > 1:
119
113
  cols_to_keep.append(col_name)
120
114
 
121
115
  dropped_columns = original_columns - set(cols_to_keep)
122
116
  if verbose:
123
- _LOGGER.info(f"🧹 Dropped {len(dropped_columns)} constant columns.")
124
117
  if dropped_columns:
125
- for dropped_column in dropped_columns:
126
- print(f" {dropped_column}")
127
-
118
+ _LOGGER.info(f"🧹 Dropped {len(dropped_columns)} constant columns: {list(dropped_columns)}")
119
+ else:
120
+ _LOGGER.info("No constant columns found.")
121
+
128
122
  # Return a new DataFrame with only the columns to keep
129
123
  df_clean = df_clean[cols_to_keep]
130
124
 
@@ -339,8 +333,7 @@ def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7, sho
339
333
  cols_to_drop = missing_fraction[missing_fraction > threshold].index # type: ignore
340
334
 
341
335
  if len(cols_to_drop) > 0:
342
- _LOGGER.info(f"🧹 Dropping columns with more than {threshold*100:.0f}% missing data:")
343
- print(list(cols_to_drop))
336
+ _LOGGER.info(f"🧹 Dropping columns with more than {threshold*100:.0f}% missing data: {list(cols_to_drop)}")
344
337
 
345
338
  result_df = df.drop(columns=cols_to_drop)
346
339
  if show_nulls_after:
@@ -370,9 +363,8 @@ def drop_macro(df: pd.DataFrame,
370
363
 
371
364
  Args:
372
365
  df (pd.DataFrame): The input pandas DataFrame to be cleaned.
373
- log_directory (Union[str, Path]): Path to the directory where the
374
- 'Missing_Data_start.csv' and 'Missing_Data_final.csv' logs
375
- will be saved.
366
+ log_directory (Union[str, Path]): Path to the directory where the missing data reports
367
+ and plots will be saved inside a "Missing Report" subdirectory.
376
368
  targets (list[str]): A list of column names to be treated as target
377
369
  variables. This list guides the row-dropping logic.
378
370
  skip_targets (bool, optional): If True, the columns listed in `targets`
@@ -388,15 +380,18 @@ def drop_macro(df: pd.DataFrame,
388
380
  # make a deep copy to work with
389
381
  df_clean = df.copy()
390
382
 
383
+ base_dir_path = make_fullpath(log_directory, make=True, enforce="directory")
384
+ full_path = base_dir_path / "Missing Report"
385
+
391
386
  # Log initial state + Plot
392
387
  missing_data_start = show_null_columns(
393
388
  df=df_clean,
394
- plot_to_dir=log_directory,
389
+ plot_to_dir=full_path,
395
390
  plot_filename="Original",
396
391
  use_all_columns=True
397
392
  )
398
393
  save_dataframe_filename(df=missing_data_start.reset_index(drop=False),
399
- save_dir=log_directory,
394
+ save_dir=full_path,
400
395
  filename="Missing_Data_Original")
401
396
 
402
397
  # Clean cycles for rows and columns
@@ -425,12 +420,12 @@ def drop_macro(df: pd.DataFrame,
425
420
  # log final state + plot
426
421
  missing_data_final = show_null_columns(
427
422
  df=df_clean,
428
- plot_to_dir=log_directory,
423
+ plot_to_dir=full_path,
429
424
  plot_filename="Processed",
430
425
  use_all_columns=True
431
426
  )
432
427
  save_dataframe_filename(df=missing_data_final.reset_index(drop=False),
433
- save_dir=log_directory,
428
+ save_dir=full_path,
434
429
  filename="Missing_Data_Processed")
435
430
 
436
431
  # return cleaned dataframe
@@ -477,9 +472,8 @@ def plot_value_distributions(
477
472
  df: pd.DataFrame,
478
473
  save_dir: Union[str, Path],
479
474
  categorical_columns: Optional[List[str]] = None,
480
- categorical_cardinality_threshold: int = 10,
481
- max_categories: int = 50,
482
- fill_na_with: str = "Missing"
475
+ max_categories: int = 100,
476
+ fill_na_with: str = "MISSING DATA"
483
477
  ):
484
478
  """
485
479
  Plots and saves the value distributions for all columns in a DataFrame,
@@ -492,15 +486,9 @@ def plot_value_distributions(
492
486
  Args:
493
487
  df (pd.DataFrame): The input DataFrame to analyze.
494
488
  save_dir (str | Path): Directory path to save the plots.
495
- categorical_columns (List[str] | None): If provided, this list
496
- of column names will be treated as categorical, and all other columns will be treated as continuous. This
497
- overrides the `continuous_cardinality_threshold` logic.
498
- categorical_cardinality_threshold (int): A numeric column will be treated
499
- as 'categorical' if its number of unique values is less than or equal to this threshold. (Ignored if `categorical_columns` is set).
500
- max_categories (int): The maximum number of unique categories a
501
- categorical feature can have to be plotted. Features exceeding this limit will be skipped.
502
- fill_na_with (str): A string to replace NaN values in categorical columns. This allows plotting 'missingness' as its
503
- own category. Defaults to "Missing".
489
+ categorical_columns (List[str] | None): If provided, these will be treated as categorical, and all other columns will be treated as continuous.
490
+ max_categories (int): The maximum number of unique categories a categorical feature can have to be plotted. Features exceeding this limit will be skipped.
491
+ fill_na_with (str): A string to replace NaN values in categorical columns. This allows plotting 'missingness' as its own category.
504
492
 
505
493
  Notes:
506
494
  - `seaborn.histplot` with KDE is used for continuous features.
@@ -535,7 +523,7 @@ def plot_value_distributions(
535
523
  is_continuous = True
536
524
  else:
537
525
  # Use auto-detection
538
- if is_numeric and n_unique > categorical_cardinality_threshold:
526
+ if is_numeric:
539
527
  is_continuous = True
540
528
 
541
529
  # --- Case 1: Continuous Numeric (Histogram) ---
@@ -550,7 +538,7 @@ def plot_value_distributions(
550
538
  save_path = numeric_dir / f"{sanitize_filename(col_name)}.svg"
551
539
  numeric_plots_saved += 1
552
540
 
553
- # --- Case 2: Categorical or Low-Cardinality Numeric (Count Plot) ---
541
+ # --- Case 2: Categorical (Count Plot) ---
554
542
  else:
555
543
  # Check max categories
556
544
  if n_unique > max_categories:
@@ -559,7 +547,7 @@ def plot_value_distributions(
559
547
 
560
548
  # Adaptive figure size
561
549
  fig_width = max(10, n_unique * 0.5)
562
- plt.figure(figsize=(fig_width, 7))
550
+ plt.figure(figsize=(fig_width, 8))
563
551
 
564
552
  # Make a temporary copy for plotting to handle NaNs
565
553
  temp_series = df[col_name].copy()
@@ -574,7 +562,7 @@ def plot_value_distributions(
574
562
 
575
563
  # Get category order by frequency
576
564
  order = temp_series.value_counts().index
577
- sns.countplot(x=temp_series, order=order, palette="viridis")
565
+ sns.countplot(x=temp_series, order=order, palette="Oranges", hue=temp_series, legend=False)
578
566
 
579
567
  plt.title(f"Distribution of '{col_name}' (Categorical)")
580
568
  plt.xlabel(col_name)
@@ -744,23 +732,23 @@ def plot_categorical_vs_target(
744
732
  targets: List[str],
745
733
  save_dir: Union[str, Path],
746
734
  features: Optional[List[str]] = None,
747
- plot_type: Literal["box", "violin"] = "box",
748
- max_categories: int = 20,
749
- fill_na_with: str = "Missing"
735
+ max_categories: int = 50,
736
+ fill_na_with: str = "MISSING DATA"
750
737
  ):
751
738
  """
752
- Plots each categorical feature against each numeric target using box or violin plots.
739
+ Plots each categorical feature against each numeric target using box plots.
753
740
 
754
741
  This function is a core EDA step for regression tasks to understand the
755
742
  relationship between a categorical independent variable and a continuous
756
743
  dependent variable.
744
+
745
+ Plots are saved as individual .svg files in a structured way, with a subdirectory created for each target.
757
746
 
758
747
  Args:
759
748
  df (pd.DataFrame): The input DataFrame.
760
749
  targets (List[str]): A list of numeric target column names (y-axis).
761
750
  save_dir (str | Path): The base directory where plots will be saved. A subdirectory will be created here for each target.
762
751
  features (List[str] | None): A list of categorical feature column names (x-axis). If None, all non-numeric (object) columns will be used.
763
- plot_type (Literal["box", "violin"]): The type of plot to generate.
764
752
  max_categories (int): The maximum number of unique categories a feature can have to be plotted. Features exceeding this limit will be skipped.
765
753
  fill_na_with (str): A string to replace NaN values in categorical columns. This allows plotting 'missingness' as its own category. Defaults to "Missing".
766
754
 
@@ -770,10 +758,6 @@ def plot_categorical_vs_target(
770
758
  """
771
759
  # 1. Validate the base save directory and inputs
772
760
  base_save_path = make_fullpath(save_dir, make=True, enforce="directory")
773
-
774
- if plot_type not in ["box", "violin"]:
775
- _LOGGER.error(f"Invalid plot type '{plot_type}'")
776
- raise ValueError()
777
761
 
778
762
  # 2. Validate target columns (must be numeric)
779
763
  valid_targets = []
@@ -796,14 +780,10 @@ def plot_categorical_vs_target(
796
780
  for col in df.columns:
797
781
  if col in valid_targets:
798
782
  continue
799
-
800
783
  # Auto-include object dtypes
801
784
  if is_object_dtype(df[col]):
802
785
  features_to_plot.append(col)
803
- # Auto-include low-cardinality numeric features - REMOVED
804
- # elif is_numeric_dtype(df[col]) and df[col].nunique() <= max_categories:
805
- # _LOGGER.info(f"Treating low-cardinality numeric column '{col}' as categorical.")
806
- # features_to_plot.append(col)
786
+
807
787
  else:
808
788
  # Validate user-provided list
809
789
  for col in features:
@@ -821,12 +801,11 @@ def plot_categorical_vs_target(
821
801
 
822
802
  for target_name in valid_targets:
823
803
  # Create a sanitized subdirectory for this target
824
- safe_target_dir_name = sanitize_filename(f"{target_name}_vs_Categorical_{plot_type}")
804
+ safe_target_dir_name = sanitize_filename(f"{target_name}_vs_Categorical")
825
805
  target_save_dir = base_save_path / safe_target_dir_name
826
806
  target_save_dir.mkdir(parents=True, exist_ok=True)
827
807
 
828
- _LOGGER.info(f"Generating '{plot_type}' plots for target: '{target_name}' -> Saving to '{target_save_dir.name}'")
829
-
808
+ _LOGGER.info(f"Generating plots for target: '{target_name}' -> Saving to '{target_save_dir.name}'")
830
809
  for feature_name in features_to_plot:
831
810
 
832
811
  # Make a temporary copy for plotting to handle NaNs and dtypes
@@ -848,12 +827,9 @@ def plot_categorical_vs_target(
848
827
 
849
828
  # 5. Create the plot
850
829
  # Increase figure width for categories
851
- plt.figure(figsize=(max(10, n_unique * 1.2), 7))
830
+ plt.figure(figsize=(max(10, n_unique * 1.2), 10))
852
831
 
853
- if plot_type == "box":
854
- sns.boxplot(x=feature_name, y=target_name, data=temp_df)
855
- elif plot_type == "violin":
856
- sns.violinplot(x=feature_name, y=target_name, data=temp_df)
832
+ sns.boxplot(x=feature_name, y=target_name, data=temp_df)
857
833
 
858
834
  plt.title(f'{target_name} vs {feature_name}')
859
835
  plt.xlabel(feature_name)
@@ -981,7 +957,7 @@ def encode_categorical_features(
981
957
 
982
958
  # Handle the dataset splitting logic
983
959
  if split_resulting_dataset:
984
- df_categorical = df_encoded[valid_columns].to_frame() # type: ignore
960
+ df_categorical = df_encoded[valid_columns]
985
961
  df_non_categorical = df.drop(columns=valid_columns)
986
962
  return mappings, df_non_categorical, df_categorical
987
963
  else:
@@ -1167,7 +1143,7 @@ def clip_outliers_single(
1167
1143
 
1168
1144
  def clip_outliers_multi(
1169
1145
  df: pd.DataFrame,
1170
- clip_dict: Dict[str, Tuple[Union[int, float], Union[int, float]]],
1146
+ clip_dict: Union[Dict[str, Tuple[int, int]], Dict[str, Tuple[float, float]]],
1171
1147
  verbose: bool=False
1172
1148
  ) -> pd.DataFrame:
1173
1149
  """
@@ -396,7 +396,7 @@ def save_dataframe(df: Union[pd.DataFrame, pl.DataFrame], full_path: Path):
396
396
  if not isinstance(full_path, Path) or not full_path.suffix.endswith(".csv"):
397
397
  _LOGGER.error('A path object pointing to a .csv file must be provided.')
398
398
  raise ValueError()
399
-
399
+
400
400
  save_dataframe_filename(df=df,
401
401
  save_dir=full_path.parent,
402
402
  filename=full_path.name)
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "dragon-ml-toolbox"
3
- version = "19.12.1"
3
+ version = "19.13.0"
4
4
  description = "Complete pipelines and helper tools for data science and machine learning projects."
5
5
  authors = [
6
6
  { name = "Karl Luigi Loza Vidaurre", email = "luigiloza@gmail.com" }
@@ -37,18 +37,18 @@ __all__ = [
37
37
  "plot_value_distributions",
38
38
  "plot_continuous_vs_target",
39
39
  "plot_categorical_vs_target",
40
- "encode_categorical_features",
41
40
  "split_features_targets",
42
- "split_continuous_binary",
41
+ "encode_categorical_features",
43
42
  "clip_outliers_single",
44
43
  "clip_outliers_multi",
45
44
  "drop_outlier_samples",
46
45
  "plot_correlation_heatmap",
46
+ "finalize_feature_schema",
47
47
  "match_and_filter_columns_by_regex",
48
48
  "standardize_percentages",
49
49
  "reconstruct_one_hot",
50
50
  "reconstruct_binary",
51
51
  "reconstruct_multibinary",
52
- "finalize_feature_schema",
52
+ "split_continuous_binary",
53
53
  "apply_feature_schema",
54
54
  ]