dragon-ml-toolbox 19.12.2__py3-none-any.whl → 19.14.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dragon_ml_toolbox-19.12.2.dist-info → dragon_ml_toolbox-19.14.0.dist-info}/METADATA +1 -1
- {dragon_ml_toolbox-19.12.2.dist-info → dragon_ml_toolbox-19.14.0.dist-info}/RECORD +7 -7
- ml_tools/_core/_data_exploration.py +152 -169
- {dragon_ml_toolbox-19.12.2.dist-info → dragon_ml_toolbox-19.14.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-19.12.2.dist-info → dragon_ml_toolbox-19.14.0.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-19.12.2.dist-info → dragon_ml_toolbox-19.14.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-19.12.2.dist-info → dragon_ml_toolbox-19.14.0.dist-info}/top_level.txt +0 -0
{dragon_ml_toolbox-19.12.2.dist-info → dragon_ml_toolbox-19.14.0.dist-info}/METADATA CHANGED (+1 -1)

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 19.12.2
+Version: 19.14.0
 Summary: Complete pipelines and helper tools for data science and machine learning projects.
 Author-email: Karl Luigi Loza Vidaurre <luigiloza@gmail.com>
 License-Expression: MIT
```
{dragon_ml_toolbox-19.12.2.dist-info → dragon_ml_toolbox-19.14.0.dist-info}/RECORD CHANGED (+7 -7)

```diff
@@ -1,5 +1,5 @@
-dragon_ml_toolbox-19.12.2.dist-info/licenses/LICENSE,sha256=…
-dragon_ml_toolbox-19.12.2.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=…
+dragon_ml_toolbox-19.14.0.dist-info/licenses/LICENSE,sha256=L35WDmmLZNTlJvxF6Vy7Uy4SYNi6rCfWUqlTHpoRMoU,1081
+dragon_ml_toolbox-19.14.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=0-HBRMMgKuwtGy6nMJZvIn1fLxhx_ksyyVB2U_iyYZU,2818
 ml_tools/ETL_cleaning.py,sha256=cKXyRFaaFs_beAGDnQM54xnML671kq-yJEGjHafW-20,351
 ml_tools/ETL_engineering.py,sha256=cwh1FhtNdUHllUDvho-x3SIVj4KwG_rFQR6VYzWUg0U,898
 ml_tools/GUI_tools.py,sha256=O89rG8WQv6GY1DiphQjIsPzXFCQID6te7q_Sgt1iTkQ,294
@@ -88,7 +88,7 @@ ml_tools/_core/_PSO_optimization.py,sha256=W3g5xw2v2eOUQadv8KHFkt5HNm9AiY3ZUk-Te
 ml_tools/_core/_SQL.py,sha256=zX_8EgYfmLmvvrnL851KMkI4w9kdkjHJ997BTvS5aig,11556
 ml_tools/_core/_VIF_factor.py,sha256=BM0mTowBqt45PXFy9oJLhT9C-CTWWo0TQhgCyWYLHtQ,10457
 ml_tools/_core/__init__.py,sha256=d4IG0OxUXj2HffepzQcYixHlZeuuuDMAFa09H_6LtmU,12
-ml_tools/_core/_data_exploration.py,sha256=…
+ml_tools/_core/_data_exploration.py,sha256=tOdtXTCh_xESKqIUuxCCo8fbcPoO9Eu5PwJwyehGKY8,76434
 ml_tools/_core/_ensemble_evaluation.py,sha256=17lWl4bWLT1BAMv_fhGf2D3wy-F4jx0HgnJ79lYkRuE,28419
 ml_tools/_core/_ensemble_inference.py,sha256=9UpARSETzmqPdQmxqizD768tjkqldxHw1ER_hM9Kx9M,8631
 ml_tools/_core/_ensemble_learning.py,sha256=X8ghbjDOLMENCWdISXLhDlHQtR3C6SW1tkTBAcfRRPY,22016
@@ -105,7 +105,7 @@ ml_tools/_core/_schema.py,sha256=TM5WVVMoKOvr_Bc2z34sU_gzKlM465PRKTgdZaEOkGY,140
 ml_tools/_core/_script_info.py,sha256=21r83LV3RubsNZ_RTEUON6RbDf7Mh4_udweNcvdF_Fk,212
 ml_tools/_core/_serde.py,sha256=tsI4EO2Y7jrBMmbQ1pinDsPOrOg-SaPuB-Dt40q0taE,5609
 ml_tools/_core/_utilities.py,sha256=oU-0hBipE96bXox66NG-hFuEMMNkKa9MkAy1yJGCSIA,22779
-dragon_ml_toolbox-19.12.2.dist-info/METADATA,sha256=…
-dragon_ml_toolbox-19.12.2.dist-info/WHEEL,sha256=…
-dragon_ml_toolbox-19.12.2.dist-info/top_level.txt,sha256=…
-dragon_ml_toolbox-19.12.2.dist-info/RECORD,,
+dragon_ml_toolbox-19.14.0.dist-info/METADATA,sha256=7QaJsWeT9idUhpAV37t64fsuWNmzXbaQqA-a-yDP2yY,8193
+dragon_ml_toolbox-19.14.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+dragon_ml_toolbox-19.14.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
+dragon_ml_toolbox-19.14.0.dist-info/RECORD,,
```
ml_tools/_core/_data_exploration.py CHANGED (+152 -169)

```diff
@@ -26,21 +26,21 @@ __all__ = [
     "drop_macro",
     "clean_column_names",
     "plot_value_distributions",
-    "plot_continuous_vs_target",
-    "plot_categorical_vs_target",
-    "encode_categorical_features",
     "split_features_targets",
-    "…",
+    "encode_categorical_features",
     "clip_outliers_single",
     "clip_outliers_multi",
     "drop_outlier_samples",
+    "plot_continuous_vs_target",
+    "plot_categorical_vs_target",
     "plot_correlation_heatmap",
+    "finalize_feature_schema",
     "match_and_filter_columns_by_regex",
     "standardize_percentages",
     "reconstruct_one_hot",
     "reconstruct_binary",
     "reconstruct_multibinary",
-    "…",
+    "split_continuous_binary",
     "apply_feature_schema"
 ]
 
```
```diff
@@ -59,16 +59,18 @@ def summarize_dataframe(df: pd.DataFrame, round_digits: int = 2):
     """
     summary = pd.DataFrame({
         'Data Type': df.dtypes,
-        '…
+        'Completeness %': (df.notnull().mean() * 100).round(2),
         'Unique Values': df.nunique(),
-        'Missing %': (df.isnull().mean() * 100).round(…
+        # 'Missing %': (df.isnull().mean() * 100).round(2)
     })
 
     # For numeric columns, add summary statistics
     numeric_cols = df.select_dtypes(include='number').columns
     if not numeric_cols.empty:
-        …
-        …
+        stats = df[numeric_cols].describe(percentiles=[.10, .25, .50, .70, .80, .90])
+
+        summary_numeric = stats.T[
+            ['mean', 'std', 'min', '10%', '25%', '50%', '70%', '80%', '90%', 'max']
         ].round(round_digits)
         summary = summary.join(summary_numeric, how='left')
 
```
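In effect, `summarize_dataframe` now reports completeness instead of missingness. A minimal sketch of the changed column, assuming nothing beyond pandas (the toy DataFrame is invented):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1.0, np.nan, 3.0, 4.0], "b": ["x", "y", None, "y"]})

# 'Completeness %' is simply the complement of the old 'Missing %' column:
completeness = (df.notnull().mean() * 100).round(2)  # a: 75.0, b: 75.0
missing = (df.isnull().mean() * 100).round(2)        # a: 25.0, b: 25.0
```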
```diff
@@ -108,22 +110,17 @@ def drop_constant_columns(df: pd.DataFrame, verbose: bool = True) -> pd.DataFram
     for col_name in df_clean.columns:
         column = df_clean[col_name]
 
-        # We can apply this logic to all columns or only focus on numeric ones.
-        # if not is_numeric_dtype(column):
-        #     cols_to_keep.append(col_name)
-        #     continue
-
         # Keep a column if it has more than one unique value (nunique ignores NaNs by default)
         if column.nunique(dropna=True) > 1:
             cols_to_keep.append(col_name)
 
     dropped_columns = original_columns - set(cols_to_keep)
     if verbose:
-        _LOGGER.info(f"🧹 Dropped {len(dropped_columns)} constant columns.")
         if dropped_columns:
-            …
-            …
-            …
+            _LOGGER.info(f"🧹 Dropped {len(dropped_columns)} constant columns: {list(dropped_columns)}")
+        else:
+            _LOGGER.info("No constant columns found.")
+
 
     # Return a new DataFrame with only the columns to keep
     df_clean = df_clean[cols_to_keep]
@@ -338,8 +335,7 @@ def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7, sho
     cols_to_drop = missing_fraction[missing_fraction > threshold].index # type: ignore
 
     if len(cols_to_drop) > 0:
-        _LOGGER.info(f"🧹 Dropping columns with more than {threshold*100:.0f}% missing data:")
-        print(list(cols_to_drop))
+        _LOGGER.info(f"🧹 Dropping columns with more than {threshold*100:.0f}% missing data: {list(cols_to_drop)}")
 
     result_df = df.drop(columns=cols_to_drop)
     if show_nulls_after:
@@ -369,9 +365,8 @@ def drop_macro(df: pd.DataFrame,
 
     Args:
         df (pd.DataFrame): The input pandas DataFrame to be cleaned.
-        log_directory (Union[str, Path]): Path to the directory where the
-            …
-            will be saved.
+        log_directory (Union[str, Path]): Path to the directory where the missing data reports
+            and plots will be saved inside a "Missing Report" subdirectory.
         targets (list[str]): A list of column names to be treated as target
             variables. This list guides the row-dropping logic.
         skip_targets (bool, optional): If True, the columns listed in `targets`
@@ -387,15 +382,18 @@ def drop_macro(df: pd.DataFrame,
     # make a deep copy to work with
     df_clean = df.copy()
 
+    base_dir_path = make_fullpath(log_directory, make=True, enforce="directory")
+    full_path = base_dir_path / "Missing Report"
+
     # Log initial state + Plot
     missing_data_start = show_null_columns(
         df=df_clean,
-        plot_to_dir=…
+        plot_to_dir=full_path,
         plot_filename="Original",
         use_all_columns=True
     )
     save_dataframe_filename(df=missing_data_start.reset_index(drop=False),
-                            save_dir=…
+                            save_dir=full_path,
                             filename="Missing_Data_Original")
 
     # Clean cycles for rows and columns
@@ -424,12 +422,12 @@
     # log final state + plot
     missing_data_final = show_null_columns(
         df=df_clean,
-        plot_to_dir=…
+        plot_to_dir=full_path,
         plot_filename="Processed",
         use_all_columns=True
     )
     save_dataframe_filename(df=missing_data_final.reset_index(drop=False),
-                            save_dir=…
+                            save_dir=full_path,
                             filename="Missing_Data_Processed")
 
     # return cleaned dataframe
```
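Both `drop_macro` hunks route their reports through the new `full_path`, so outputs land one directory deeper than before. A sketch of the resulting layout, assuming `make_fullpath` simply resolves and creates the directory (file extensions depend on `save_dataframe_filename`):

```python
from pathlib import Path

log_directory = Path("run_logs")
full_path = log_directory / "Missing Report"  # subdirectory introduced by this diff

# drop_macro now writes here, e.g.:
#   run_logs/Missing Report/Missing_Data_Original.<ext>
#   run_logs/Missing Report/Missing_Data_Processed.<ext>
```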
```diff
@@ -476,9 +474,8 @@ def plot_value_distributions(
     df: pd.DataFrame,
     save_dir: Union[str, Path],
     categorical_columns: Optional[List[str]] = None,
-    …
-    …
-    fill_na_with: str = "Missing"
+    max_categories: int = 100,
+    fill_na_with: str = "MISSING DATA"
 ):
     """
     Plots and saves the value distributions for all columns in a DataFrame,
@@ -491,15 +488,9 @@ def plot_value_distributions(
     Args:
         df (pd.DataFrame): The input DataFrame to analyze.
         save_dir (str | Path): Directory path to save the plots.
-        categorical_columns (List[str] | None): If provided,
-            …
-            …
-        categorical_cardinality_threshold (int): A numeric column will be treated
-            as 'categorical' if its number of unique values is less than or equal to this threshold. (Ignored if `categorical_columns` is set).
-        max_categories (int): The maximum number of unique categories a
-            categorical feature can have to be plotted. Features exceeding this limit will be skipped.
-        fill_na_with (str): A string to replace NaN values in categorical columns. This allows plotting 'missingness' as its
-            own category. Defaults to "Missing".
+        categorical_columns (List[str] | None): If provided, these will be treated as categorical, and all other columns will be treated as continuous.
+        max_categories (int): The maximum number of unique categories a categorical feature can have to be plotted. Features exceeding this limit will be skipped.
+        fill_na_with (str): A string to replace NaN values in categorical columns. This allows plotting 'missingness' as its own category.
 
     Notes:
         - `seaborn.histplot` with KDE is used for continuous features.
```
|
|
|
534
525
|
is_continuous = True
|
|
535
526
|
else:
|
|
536
527
|
# Use auto-detection
|
|
537
|
-
if is_numeric
|
|
528
|
+
if is_numeric:
|
|
538
529
|
is_continuous = True
|
|
539
530
|
|
|
540
531
|
# --- Case 1: Continuous Numeric (Histogram) ---
|
|
@@ -549,7 +540,7 @@ def plot_value_distributions(
|
|
|
549
540
|
save_path = numeric_dir / f"{sanitize_filename(col_name)}.svg"
|
|
550
541
|
numeric_plots_saved += 1
|
|
551
542
|
|
|
552
|
-
# --- Case 2: Categorical
|
|
543
|
+
# --- Case 2: Categorical (Count Plot) ---
|
|
553
544
|
else:
|
|
554
545
|
# Check max categories
|
|
555
546
|
if n_unique > max_categories:
|
|
@@ -558,7 +549,7 @@ def plot_value_distributions(
|
|
|
558
549
|
|
|
559
550
|
# Adaptive figure size
|
|
560
551
|
fig_width = max(10, n_unique * 0.5)
|
|
561
|
-
plt.figure(figsize=(fig_width,
|
|
552
|
+
plt.figure(figsize=(fig_width, 8))
|
|
562
553
|
|
|
563
554
|
# Make a temporary copy for plotting to handle NaNs
|
|
564
555
|
temp_series = df[col_name].copy()
|
|
@@ -573,7 +564,7 @@ def plot_value_distributions(
|
|
|
573
564
|
|
|
574
565
|
# Get category order by frequency
|
|
575
566
|
order = temp_series.value_counts().index
|
|
576
|
-
sns.countplot(x=temp_series, order=order, palette="
|
|
567
|
+
sns.countplot(x=temp_series, order=order, palette="Oranges", hue=temp_series, legend=False)
|
|
577
568
|
|
|
578
569
|
plt.title(f"Distribution of '{col_name}' (Categorical)")
|
|
579
570
|
plt.xlabel(col_name)
|
|
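The countplot change follows the pattern seaborn has recommended since 0.13, where passing `palette` without `hue` is deprecated: assign the plotted variable to `hue` and disable the redundant legend. A standalone sketch:

```python
import pandas as pd
import seaborn as sns

s = pd.Series(["a", "b", "a", "c", "a"], name="category")

# Assigning the x variable to `hue` keeps the per-bar palette without
# triggering the "palette without hue" deprecation warning (seaborn >= 0.13).
sns.countplot(x=s, order=s.value_counts().index, palette="Oranges", hue=s, legend=False)
```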
```diff
@@ -607,68 +598,55 @@ def plot_value_distributions(
 
 
 def plot_continuous_vs_target(
-    …
-    …
+    df_continuous: pd.DataFrame,
+    df_targets: pd.DataFrame,
     save_dir: Union[str, Path],
-    …
+    verbose: int = 1
 ):
     """
-    Plots each continuous feature against each target…
+    Plots each continuous feature from df_continuous against each target in df_targets.
 
-    This function…
-    …
-    regression line, and saves each plot as an individual .svg file.
+    This function creates a scatter plot for each feature-target pair, overlays a
+    simple linear regression line, and saves each plot as an individual .svg file.
 
     Plots are saved in a structured way, with a subdirectory created for
     each target variable.
 
     Args:
-        …
-        …
-        save_dir (str | Path): The base directory where plots will be saved.
-        …
-            DataFrame will be used.
+        df_continuous (pd.DataFrame): DataFrame containing continuous feature columns (x-axis).
+        df_targets (pd.DataFrame): DataFrame containing target columns (y-axis).
+        save_dir (str | Path): The base directory where plots will be saved.
+        verbose (int): Verbosity level for logging warnings.
 
     Notes:
-        - Only numeric features and numeric targets are processed.
-        …
-        - …
-          pairwise for each plot.
+        - Only numeric features and numeric targets are processed.
+        - Rows with NaN in either the feature or the target are dropped pairwise.
+        - Assumes df_continuous and df_targets share the same index.
     """
     # 1. Validate the base save directory
     base_save_path = make_fullpath(save_dir, make=True, enforce="directory")
 
-    # 2. …
-    def …
+    # 2. Validation helper
+    def _get_valid_numeric_cols(df: pd.DataFrame, df_name: str) -> List[str]:
         valid_cols = []
-        for col in …
-            if …
-                …
-                …
-                _LOGGER.warning(f"{col_type} column '{col}' is not numeric. Skipping.")
+        for col in df.columns:
+            if not is_numeric_dtype(df[col]):
+                if verbose > 0:
+                    _LOGGER.warning(f"Column '{col}' in {df_name} is not numeric. Skipping.")
             else:
                 valid_cols.append(col)
         return valid_cols
 
-    # 3. Validate target columns
-    valid_targets = …
+    # 3. Validate target columns
+    valid_targets = _get_valid_numeric_cols(df_targets, "df_targets")
     if not valid_targets:
-        _LOGGER.error("No valid numeric target columns provided…
+        _LOGGER.error("No valid numeric target columns provided in df_targets.")
         return
 
-    # 4. …
-    …
-        _LOGGER.info("No 'features' list provided. Using all non-target columns as features.")
-        target_set = set(valid_targets)
-        # Get all columns that are not in the valid_targets set
-        features_to_validate = [col for col in df.columns if col not in target_set]
-    else:
-        features_to_validate = features
-
-    valid_features = _validate_numeric_cols(features_to_validate, "Feature")
-
+    # 4. Validate feature columns
+    valid_features = _get_valid_numeric_cols(df_continuous, "df_continuous")
     if not valid_features:
-        _LOGGER.error("No valid numeric feature columns…
+        _LOGGER.error("No valid numeric feature columns provided in df_continuous.")
         return
 
     # 5. Main plotting loop
```
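The rewrite is a breaking signature change: instead of one DataFrame plus column-name lists, the function now takes two pre-split DataFrames. A hypothetical call (column names invented):

```python
feature_cols = ["age", "bmi", "children"]   # numeric features (x-axis)
target_cols = ["charges"]                   # numeric targets (y-axis)

plot_continuous_vs_target(
    df_continuous=df[feature_cols],
    df_targets=df[target_cols],
    save_dir="plots/continuous_vs_target",
    verbose=1,  # 0 silences logging; 2 additionally reports skipped feature-target pairs
)
```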
```diff
@@ -680,15 +658,20 @@ def plot_continuous_vs_target(
         target_save_dir = base_save_path / safe_target_dir_name
         target_save_dir.mkdir(parents=True, exist_ok=True)
 
-        …
+        if verbose > 0:
+            _LOGGER.info(f"Generating plots for target: '{target_name}' -> Saving to '{target_save_dir.name}'")
 
         for feature_name in valid_features:
 
-            # …
-            temp_df = …
+            # Align data and drop NaNs pairwise - use concat to ensure we respect the index alignment between the two DFs
+            temp_df = pd.concat([
+                df_continuous[feature_name],
+                df_targets[target_name]
+            ], axis=1).dropna()
 
             if temp_df.empty:
-                …
+                if verbose > 1:
+                    _LOGGER.warning(f"No non-null data for '{feature_name}' vs '{target_name}'. Skipping plot.")
                 continue
 
             x = temp_df[feature_name]
```
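The concat-then-dropna idiom aligns the two frames on their shared index before discarding incomplete pairs, which is why the docstring now warns that both DataFrames must share an index. A minimal illustration:

```python
import numpy as np
import pandas as pd

feat = pd.Series([1.0, 2.0, np.nan, 4.0], name="bmi")
tgt = pd.Series([10.0, np.nan, 30.0, 40.0], name="charges")

pair = pd.concat([feat, tgt], axis=1).dropna()
# Rows 1 and 2 are dropped; rows 0 and 3 survive. A NaN in one feature only
# removes the row for that specific feature-target pair, not for other features.
print(pair)
```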
```diff
@@ -696,11 +679,12 @@
 
             # 6. Perform linear fit
             try:
-                # Modern replacement for np.polyfit + np.poly1d
+                # Modern replacement for np.polyfit + np.poly1d
                 p = np.polynomial.Polynomial.fit(x, y, deg=1)
                 plot_regression_line = True
             except (np.linalg.LinAlgError, ValueError):
-                …
+                if verbose > 0:
+                    _LOGGER.warning(f"Linear regression failed for '{feature_name}' vs '{target_name}'. Plotting scatter only.")
                 plot_regression_line = False
 
             # 7. Create the plot
```
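For reference, the numpy polynomial API used here fits in a scaled domain for numerical stability and evaluates like a callable; `convert()` maps coefficients back to the original x-domain. A standalone sketch:

```python
import numpy as np

x = np.array([0.0, 1.0, 2.0, 3.0])
y = np.array([1.0, 3.0, 5.0, 7.0])  # y = 2x + 1

p = np.polynomial.Polynomial.fit(x, y, deg=1)  # replaces np.polyfit + np.poly1d
y_hat = p(x)                                   # evaluate directly, like a function
intercept, slope = p.convert().coef            # ascending degree: [1.0, 2.0]
```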
```diff
@@ -734,87 +718,68 @@ def plot_continuous_vs_target(
 
             # Close the figure to free up memory
             plt.close()
-    …
-    …
+
+    if verbose > 0:
+        _LOGGER.info(f"Successfully saved {total_plots_saved} feature-vs-target plots to '{base_save_path}'.")
 
 
 def plot_categorical_vs_target(
-    …
-    …
+    df_categorical: pd.DataFrame,
+    df_targets: pd.DataFrame,
     save_dir: Union[str, Path],
-    …
-    …
-    …
-    …
+    max_categories: int = 50,
+    fill_na_with: str = "MISSING DATA",
+    drop_empty_targets: bool = True,
+    verbose: int = 1
 ):
     """
-    Plots each …
+    Plots each feature in df_categorical against each numeric target in df_targets using box plots.
 
-    …
-    …
-    dependent variable.
-
-    Plots are saved as individual .svg files in a structured way, with a subdirectory created for each target.
+    Automatically aligns the two DataFrames by index. If a numeric
+    column is passed within df_categorical, it will be cast to object type to treat it as a category.
 
     Args:
-        …
-        …
-        save_dir (str | Path): …
-        …
-        …
-        …
-        …
+        df_categorical (pd.DataFrame): DataFrame containing categorical feature columns (x-axis).
+        df_targets (pd.DataFrame): DataFrame containing numeric target columns (y-axis).
+        save_dir (str | Path): Base directory for saving plots.
+        max_categories (int): The maximum number of unique categories a feature can have to be plotted.
+        fill_na_with (str): String to replace NaN values in categorical columns.
+        drop_empty_targets (bool): If True, drops rows where the target value is NaN before plotting.
+        verbose (int): Verbosity level for logging warnings.
 
     Notes:
-        - …
-        - Features are automatically identified as categorical if they are 'object' dtype.
+        - Assumes df_categorical and df_targets share the same index.
     """
-    # 1. Validate the base save directory
+    # 1. Validate the base save directory
     base_save_path = make_fullpath(save_dir, make=True, enforce="directory")
-
-    if plot_type not in ["box", "violin"]:
-        _LOGGER.error(f"Invalid plot type '{plot_type}'")
-        raise ValueError()
 
     # 2. Validate target columns (must be numeric)
     valid_targets = []
-    for col in …
-        if …
-            …
-            …
-            _LOGGER.warning(f"Target column '{col}' is not numeric. Skipping.")
+    for col in df_targets.columns:
+        if not is_numeric_dtype(df_targets[col]):
+            if verbose > 0:
+                _LOGGER.warning(f"Target column '{col}' in df_targets is not numeric. Skipping.")
         else:
             valid_targets.append(col)
 
     if not valid_targets:
-        _LOGGER.error("No valid numeric target columns provided…
+        _LOGGER.error("No valid numeric target columns provided in df_targets.")
         return
 
-    # 3. …
-    …
-    …
-    …
-    …
-        if …
-        …
-        …
-        …
-        …
-        …
-        # Auto-include low-cardinality numeric features - REMOVED
-        # elif is_numeric_dtype(df[col]) and df[col].nunique() <= max_categories:
-        #     _LOGGER.info(f"Treating low-cardinality numeric column '{col}' as categorical.")
-        #     features_to_plot.append(col)
-    else:
-        # Validate user-provided list
-        for col in features:
-            if col not in df.columns:
-                _LOGGER.warning(f"Feature column '{col}' not found in DataFrame. Skipping.")
-            else:
-                features_to_plot.append(col)
+    # 3. Validate feature columns (Flexible: Allow numeric but warn)
+    valid_features = []
+    for col in df_categorical.columns:
+        # If numeric, warn but accept it (will be cast to object later)
+        if is_numeric_dtype(df_categorical[col]):
+            if verbose > 0:
+                _LOGGER.warning(f"Feature '{col}' in df_categorical is numeric. It will be cast to 'object' and treated as categorical.")
+            valid_features.append(col)
+        else:
+            # Assume it is already object/category
+            valid_features.append(col)
 
-    if not …
-        _LOGGER.error("No valid …
+    if not valid_features:
+        _LOGGER.error("No valid feature columns provided in df_categorical.")
         return
 
     # 4. Main plotting loop
```
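As with its continuous counterpart, this function now takes pre-split DataFrames, and the old plot_type option is gone: box plots are always used. A hypothetical call (column names invented):

```python
plot_categorical_vs_target(
    df_categorical=df[["region", "smoker"]],  # object/category columns; numerics are coerced with a warning
    df_targets=df[["charges"]],               # must be numeric
    save_dir="plots/categorical_vs_target",
    max_categories=50,            # features with more unique categories are skipped
    fill_na_with="MISSING DATA",  # feature NaNs become their own x-axis category
    drop_empty_targets=True,      # rows with a NaN target are dropped before plotting
)
```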
```diff
@@ -822,39 +787,53 @@ def plot_categorical_vs_target(
 
     for target_name in valid_targets:
         # Create a sanitized subdirectory for this target
-        safe_target_dir_name = sanitize_filename(f"{target_name}…
+        safe_target_dir_name = sanitize_filename(f"{target_name}_vs_Categorical")
         target_save_dir = base_save_path / safe_target_dir_name
         target_save_dir.mkdir(parents=True, exist_ok=True)
 
-        …
+        if verbose > 0:
+            _LOGGER.info(f"Generating plots for target: '{target_name}' -> Saving to '{target_save_dir.name}'")
+
+        for feature_name in valid_features:
+
+            # Align data using concat to respect indices
+            feature_series = df_categorical[feature_name]
+            target_series = df_targets[target_name]
+
+            # Create a temporary DataFrame for this pair
+            temp_df = pd.concat([feature_series, target_series], axis=1)
+
+            # Optional: Drop rows where the target is NaN
+            if drop_empty_targets:
+                temp_df = temp_df.dropna(subset=[target_name])
+                if temp_df.empty:
+                    if verbose > 1:
+                        _LOGGER.warning(f"No valid data left for '{feature_name}' vs '{target_name}' after dropping empty targets. Skipping.")
+                    continue
 
-        …
+            # Force feature to object if it isn't already (handling the numeric flexibility)
+            if not is_object_dtype(temp_df[feature_name]):
+                temp_df[feature_name] = temp_df[feature_name].astype(object)
+
+            # Handle NaNs in the feature column (treat as a category)
+            if temp_df[feature_name].isnull().any():
+                temp_df[feature_name] = temp_df[feature_name].fillna(fill_na_with)
 
-            # …
-            temp_df = …
+            # Convert to string to ensure consistent plotting and cardinality check
+            temp_df[feature_name] = temp_df[feature_name].astype(str)
 
             # Check cardinality
             n_unique = temp_df[feature_name].nunique()
             if n_unique > max_categories:
-                …
+                if verbose > 1:
+                    _LOGGER.warning(f"Skipping '{feature_name}': {n_unique} unique categories > {max_categories} max_categories.")
                 continue
-
-            # Handle NaNs by replacing them with the specified string
-            if temp_df[feature_name].isnull().any():
-                # Convert to object type first to allow string replacement
-                temp_df[feature_name] = temp_df[feature_name].astype(object).fillna(fill_na_with)
-
-            # Convert feature to string to ensure correct plotting order
-            temp_df[feature_name] = temp_df[feature_name].astype(str)
 
             # 5. Create the plot
-            # …
-            plt.figure(figsize=(max(10, n_unique * …
+            # Dynamic figure width based on number of categories
+            plt.figure(figsize=(max(10, n_unique * 0.8), 10))
 
-            if plot_type == "box":
-                sns.boxplot(x=feature_name, y=target_name, data=temp_df)
-            elif plot_type == "violin":
-                sns.violinplot(x=feature_name, y=target_name, data=temp_df)
+            sns.boxplot(x=feature_name, y=target_name, data=temp_df)
 
             plt.title(f'{target_name} vs {feature_name}')
             plt.xlabel(feature_name)
@@ -875,8 +854,9 @@ def plot_categorical_vs_target(
                 _LOGGER.error(f"Failed to save plot: {plot_path}. Error: {e}")
 
             plt.close()
-    …
-    …
+
+    if verbose > 0:
+        _LOGGER.info(f"Successfully saved {total_plots_saved} categorical-vs-target plots to '{base_save_path}'.")
 
 
 def encode_categorical_features(
@@ -982,7 +962,7 @@ def encode_categorical_features(
 
     # Handle the dataset splitting logic
     if split_resulting_dataset:
-        df_categorical = df_encoded[valid_columns]
+        df_categorical = df_encoded[valid_columns]
         df_non_categorical = df.drop(columns=valid_columns)
         return mappings, df_non_categorical, df_categorical
     else:
```
```diff
@@ -1103,7 +1083,10 @@ def plot_correlation_heatmap(df: pd.DataFrame,
         annot=annot_bool,
         cmap='coolwarm',
         fmt=".2f",
-        cbar_kws={"shrink": 0.8}
+        cbar_kws={"shrink": 0.8},
+        vmin=-1,   # Anchors minimum color to -1
+        vmax=1,    # Anchors maximum color to 1
+        center=0   # Ensures 0 corresponds to the neutral color (white)
     )
 
     # add suffix to title
```
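Anchoring the color scale matters because seaborn otherwise rescales the colormap to the observed data range, so a matrix of weak correlations can look deceptively saturated. A standalone sketch:

```python
import numpy as np
import seaborn as sns

corr = np.array([[ 1.0,  0.2, -0.1],
                 [ 0.2,  1.0,  0.3],
                 [-0.1,  0.3,  1.0]])

# vmin/vmax pin the scale to the full correlation range [-1, 1];
# center=0 keeps zero correlation on the neutral midpoint of 'coolwarm'.
sns.heatmap(corr, cmap="coolwarm", vmin=-1, vmax=1, center=0, annot=True, fmt=".2f")
```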
Files renamed without content changes:

- {dragon_ml_toolbox-19.12.2.dist-info → dragon_ml_toolbox-19.14.0.dist-info}/licenses/LICENSE
- {dragon_ml_toolbox-19.12.2.dist-info → dragon_ml_toolbox-19.14.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md
- {dragon_ml_toolbox-19.12.2.dist-info → dragon_ml_toolbox-19.14.0.dist-info}/WHEEL
- {dragon_ml_toolbox-19.12.2.dist-info → dragon_ml_toolbox-19.14.0.dist-info}/top_level.txt