dragon-ml-toolbox 19.12.2__py3-none-any.whl → 19.13.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dragon_ml_toolbox-19.12.2.dist-info → dragon_ml_toolbox-19.13.0.dist-info}/METADATA +1 -1
- {dragon_ml_toolbox-19.12.2.dist-info → dragon_ml_toolbox-19.13.0.dist-info}/RECORD +7 -7
- ml_tools/_core/_data_exploration.py +35 -60
- {dragon_ml_toolbox-19.12.2.dist-info → dragon_ml_toolbox-19.13.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-19.12.2.dist-info → dragon_ml_toolbox-19.13.0.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-19.12.2.dist-info → dragon_ml_toolbox-19.13.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-19.12.2.dist-info → dragon_ml_toolbox-19.13.0.dist-info}/top_level.txt +0 -0
{dragon_ml_toolbox-19.12.2.dist-info → dragon_ml_toolbox-19.13.0.dist-info}/METADATA RENAMED
```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 19.12.2
+Version: 19.13.0
 Summary: Complete pipelines and helper tools for data science and machine learning projects.
 Author-email: Karl Luigi Loza Vidaurre <luigiloza@gmail.com>
 License-Expression: MIT
```
{dragon_ml_toolbox-19.12.2.dist-info → dragon_ml_toolbox-19.13.0.dist-info}/RECORD RENAMED
```diff
@@ -1,5 +1,5 @@
-dragon_ml_toolbox-19.12.2.dist-info/licenses/LICENSE,sha256=…
-dragon_ml_toolbox-19.12.2.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=…
+dragon_ml_toolbox-19.13.0.dist-info/licenses/LICENSE,sha256=L35WDmmLZNTlJvxF6Vy7Uy4SYNi6rCfWUqlTHpoRMoU,1081
+dragon_ml_toolbox-19.13.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=0-HBRMMgKuwtGy6nMJZvIn1fLxhx_ksyyVB2U_iyYZU,2818
 ml_tools/ETL_cleaning.py,sha256=cKXyRFaaFs_beAGDnQM54xnML671kq-yJEGjHafW-20,351
 ml_tools/ETL_engineering.py,sha256=cwh1FhtNdUHllUDvho-x3SIVj4KwG_rFQR6VYzWUg0U,898
 ml_tools/GUI_tools.py,sha256=O89rG8WQv6GY1DiphQjIsPzXFCQID6te7q_Sgt1iTkQ,294
@@ -88,7 +88,7 @@ ml_tools/_core/_PSO_optimization.py,sha256=W3g5xw2v2eOUQadv8KHFkt5HNm9AiY3ZUk-Te
 ml_tools/_core/_SQL.py,sha256=zX_8EgYfmLmvvrnL851KMkI4w9kdkjHJ997BTvS5aig,11556
 ml_tools/_core/_VIF_factor.py,sha256=BM0mTowBqt45PXFy9oJLhT9C-CTWWo0TQhgCyWYLHtQ,10457
 ml_tools/_core/__init__.py,sha256=d4IG0OxUXj2HffepzQcYixHlZeuuuDMAFa09H_6LtmU,12
-ml_tools/_core/_data_exploration.py,sha256=…
+ml_tools/_core/_data_exploration.py,sha256=VPSqTo8IPLDOGcVDAcdyxgzO0Fw224pbivzbli_aad0,76159
 ml_tools/_core/_ensemble_evaluation.py,sha256=17lWl4bWLT1BAMv_fhGf2D3wy-F4jx0HgnJ79lYkRuE,28419
 ml_tools/_core/_ensemble_inference.py,sha256=9UpARSETzmqPdQmxqizD768tjkqldxHw1ER_hM9Kx9M,8631
 ml_tools/_core/_ensemble_learning.py,sha256=X8ghbjDOLMENCWdISXLhDlHQtR3C6SW1tkTBAcfRRPY,22016
@@ -105,7 +105,7 @@ ml_tools/_core/_schema.py,sha256=TM5WVVMoKOvr_Bc2z34sU_gzKlM465PRKTgdZaEOkGY,140
 ml_tools/_core/_script_info.py,sha256=21r83LV3RubsNZ_RTEUON6RbDf7Mh4_udweNcvdF_Fk,212
 ml_tools/_core/_serde.py,sha256=tsI4EO2Y7jrBMmbQ1pinDsPOrOg-SaPuB-Dt40q0taE,5609
 ml_tools/_core/_utilities.py,sha256=oU-0hBipE96bXox66NG-hFuEMMNkKa9MkAy1yJGCSIA,22779
-dragon_ml_toolbox-19.12.2.dist-info/METADATA,sha256=…
-dragon_ml_toolbox-19.12.2.dist-info/WHEEL,sha256=…
-dragon_ml_toolbox-19.12.2.dist-info/top_level.txt,sha256=…
-dragon_ml_toolbox-19.12.2.dist-info/RECORD,,
+dragon_ml_toolbox-19.13.0.dist-info/METADATA,sha256=349zn3DuPgY4UmlKJ7YuI1lNhGCXnYFYe4zo63mDkbE,8193
+dragon_ml_toolbox-19.13.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+dragon_ml_toolbox-19.13.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
+dragon_ml_toolbox-19.13.0.dist-info/RECORD,,
```
ml_tools/_core/_data_exploration.py CHANGED
```diff
@@ -28,19 +28,19 @@ __all__ = [
     "plot_value_distributions",
     "plot_continuous_vs_target",
     "plot_categorical_vs_target",
-    "encode_categorical_features",
     "split_features_targets",
-    "…
+    "encode_categorical_features",
     "clip_outliers_single",
     "clip_outliers_multi",
     "drop_outlier_samples",
     "plot_correlation_heatmap",
+    "finalize_feature_schema",
     "match_and_filter_columns_by_regex",
     "standardize_percentages",
     "reconstruct_one_hot",
     "reconstruct_binary",
     "reconstruct_multibinary",
-    "…
+    "split_continuous_binary",
     "apply_feature_schema"
 ]
 
```
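This hunk reorders the exports and adds `finalize_feature_schema` to `__all__`. A minimal usage sketch, assuming the names are re-exported through a public `ml_tools.data_exploration` module (the import path is an assumption; only the `__all__` entries are confirmed by this diff):

```python
# Import path is assumed; only the exported names come from the hunk above.
from ml_tools.data_exploration import (
    finalize_feature_schema,      # added to __all__ in 19.13.0
    split_continuous_binary,      # entry rewritten in this hunk
    encode_categorical_features,  # moved after split_features_targets
)
```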
```diff
@@ -108,22 +108,17 @@ def drop_constant_columns(df: pd.DataFrame, verbose: bool = True) -> pd.DataFram
     for col_name in df_clean.columns:
         column = df_clean[col_name]
 
-        # We can apply this logic to all columns or only focus on numeric ones.
-        # if not is_numeric_dtype(column):
-        #     cols_to_keep.append(col_name)
-        #     continue
-
         # Keep a column if it has more than one unique value (nunique ignores NaNs by default)
         if column.nunique(dropna=True) > 1:
             cols_to_keep.append(col_name)
 
     dropped_columns = original_columns - set(cols_to_keep)
     if verbose:
-        _LOGGER.info(f"🧹 Dropped {len(dropped_columns)} constant columns.")
         if dropped_columns:
-            …
-            …
-            …
+            _LOGGER.info(f"🧹 Dropped {len(dropped_columns)} constant columns: {list(dropped_columns)}")
+        else:
+            _LOGGER.info("No constant columns found.")
+
 
     # Return a new DataFrame with only the columns to keep
     df_clean = df_clean[cols_to_keep]
```
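The retained rule is unchanged: a column survives only if it has more than one unique non-NaN value; the rewrite just folds the dropped-column list into a single log line. A self-contained sketch of that rule on toy data:

```python
import pandas as pd

df = pd.DataFrame({
    "constant": [1, 1, 1],          # dropped: one unique value
    "all_nan": [None, None, None],  # dropped: zero unique non-NaN values
    "varied": [1, 2, 3],            # kept
})

# Same survival rule as the hunk above (nunique ignores NaNs by default).
cols_to_keep = [c for c in df.columns if df[c].nunique(dropna=True) > 1]
print(cols_to_keep)  # ['varied']
```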
```diff
@@ -338,8 +333,7 @@ def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7, sho
     cols_to_drop = missing_fraction[missing_fraction > threshold].index # type: ignore
 
     if len(cols_to_drop) > 0:
-        _LOGGER.info(f"🧹 Dropping columns with more than {threshold*100:.0f}% missing data:")
-        print(list(cols_to_drop))
+        _LOGGER.info(f"🧹 Dropping columns with more than {threshold*100:.0f}% missing data: {list(cols_to_drop)}")
 
     result_df = df.drop(columns=cols_to_drop)
     if show_nulls_after:
```
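The dropping logic itself is untouched; only the bare `print` is merged into the logger message. A standalone illustration of the threshold rule in plain pandas:

```python
import pandas as pd

df = pd.DataFrame({
    "mostly_missing": [1, None, None, None],  # 75% missing
    "mostly_present": [1, 2, None, 4],        # 25% missing
})

threshold = 0.7
missing_fraction = df.isnull().mean()  # fraction of NaNs per column
cols_to_drop = missing_fraction[missing_fraction > threshold].index
print(list(cols_to_drop))  # ['mostly_missing']
```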
```diff
@@ -369,9 +363,8 @@ def drop_macro(df: pd.DataFrame,
 
     Args:
         df (pd.DataFrame): The input pandas DataFrame to be cleaned.
-        log_directory (Union[str, Path]): Path to the directory where the
-            …
-            will be saved.
+        log_directory (Union[str, Path]): Path to the directory where the missing data reports
+            and plots will be saved inside a "Missing Report" subdirectory.
         targets (list[str]): A list of column names to be treated as target
             variables. This list guides the row-dropping logic.
         skip_targets (bool, optional): If True, the columns listed in `targets`
```
```diff
@@ -387,15 +380,18 @@ def drop_macro(df: pd.DataFrame,
     # make a deep copy to work with
     df_clean = df.copy()
 
+    base_dir_path = make_fullpath(log_directory, make=True, enforce="directory")
+    full_path = base_dir_path / "Missing Report"
+
     # Log initial state + Plot
     missing_data_start = show_null_columns(
         df=df_clean,
-        plot_to_dir=…
+        plot_to_dir=full_path,
         plot_filename="Original",
         use_all_columns=True
     )
     save_dataframe_filename(df=missing_data_start.reset_index(drop=False),
-                            save_dir=…
+                            save_dir=full_path,
                             filename="Missing_Data_Original")
 
     # Clean cycles for rows and columns
```
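The two added lines route every report and plot into a `Missing Report` subdirectory of `log_directory`. A plain-`pathlib` sketch of the equivalent path handling, standing in for the library's `make_fullpath` helper (whose behavior beyond resolving and creating the directory is assumed):

```python
from pathlib import Path

# Stand-in for make_fullpath(log_directory, make=True, enforce="directory").
log_directory = Path("logs")  # hypothetical input directory
base_dir_path = log_directory.resolve()
base_dir_path.mkdir(parents=True, exist_ok=True)

# New in 19.13.0: reports and plots are nested one level down.
full_path = base_dir_path / "Missing Report"
full_path.mkdir(parents=True, exist_ok=True)
print(full_path)  # e.g. /.../logs/Missing Report
```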
```diff
@@ -424,12 +420,12 @@ def drop_macro(df: pd.DataFrame,
     # log final state + plot
     missing_data_final = show_null_columns(
         df=df_clean,
-        plot_to_dir=…
+        plot_to_dir=full_path,
         plot_filename="Processed",
         use_all_columns=True
     )
     save_dataframe_filename(df=missing_data_final.reset_index(drop=False),
-                            save_dir=…
+                            save_dir=full_path,
                             filename="Missing_Data_Processed")
 
     # return cleaned dataframe
```
```diff
@@ -476,9 +472,8 @@ def plot_value_distributions(
     df: pd.DataFrame,
     save_dir: Union[str, Path],
     categorical_columns: Optional[List[str]] = None,
-    …
-    …
-    fill_na_with: str = "Missing"
+    max_categories: int = 100,
+    fill_na_with: str = "MISSING DATA"
 ):
     """
     Plots and saves the value distributions for all columns in a DataFrame,
```
```diff
@@ -491,15 +486,9 @@ def plot_value_distributions(
     Args:
         df (pd.DataFrame): The input DataFrame to analyze.
         save_dir (str | Path): Directory path to save the plots.
-        categorical_columns (List[str] | None): If provided, …
-            …
-            …
-        categorical_cardinality_threshold (int): A numeric column will be treated
-            as 'categorical' if its number of unique values is less than or equal to this threshold. (Ignored if `categorical_columns` is set).
-        max_categories (int): The maximum number of unique categories a
-            categorical feature can have to be plotted. Features exceeding this limit will be skipped.
-        fill_na_with (str): A string to replace NaN values in categorical columns. This allows plotting 'missingness' as its
-            own category. Defaults to "Missing".
+        categorical_columns (List[str] | None): If provided, these will be treated as categorical, and all other columns will be treated as continuous.
+        max_categories (int): The maximum number of unique categories a categorical feature can have to be plotted. Features exceeding this limit will be skipped.
+        fill_na_with (str): A string to replace NaN values in categorical columns. This allows plotting 'missingness' as its own category.
 
     Notes:
         - `seaborn.histplot` with KDE is used for continuous features.
```
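Put together, the new signature makes a call look like this. A sketch: the import path and sample data are illustrative, while the parameter names and the defaults `max_categories=100` and `fill_na_with="MISSING DATA"` come straight from the hunks above:

```python
import pandas as pd
from ml_tools.data_exploration import plot_value_distributions  # import path assumed

df = pd.DataFrame({
    "age": [23, 31, 31, 45, 52],
    "city": ["Lima", "Cusco", None, "Lima", "Cusco"],
})

# NaNs in 'city' are plotted as their own "MISSING DATA" category.
plot_value_distributions(
    df=df,
    save_dir="eda_plots",
    categorical_columns=["city"],  # everything else is treated as continuous
    max_categories=100,            # new default shown in the diff
    fill_na_with="MISSING DATA",   # new default shown in the diff
)
```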
```diff
@@ -534,7 +523,7 @@ def plot_value_distributions(
         is_continuous = True
     else:
         # Use auto-detection
-        if is_numeric…
+        if is_numeric:
             is_continuous = True
 
     # --- Case 1: Continuous Numeric (Histogram) ---
```
```diff
@@ -549,7 +538,7 @@ def plot_value_distributions(
             save_path = numeric_dir / f"{sanitize_filename(col_name)}.svg"
             numeric_plots_saved += 1
 
-        # --- Case 2: Categorical …
+        # --- Case 2: Categorical (Count Plot) ---
         else:
             # Check max categories
             if n_unique > max_categories:
```
```diff
@@ -558,7 +547,7 @@ def plot_value_distributions(
 
             # Adaptive figure size
             fig_width = max(10, n_unique * 0.5)
-            plt.figure(figsize=(fig_width, …
+            plt.figure(figsize=(fig_width, 8))
 
             # Make a temporary copy for plotting to handle NaNs
             temp_series = df[col_name].copy()
```
```diff
@@ -573,7 +562,7 @@ def plot_value_distributions(
 
             # Get category order by frequency
             order = temp_series.value_counts().index
-            sns.countplot(x=temp_series, order=order, palette="…
+            sns.countplot(x=temp_series, order=order, palette="Oranges", hue=temp_series, legend=False)
 
             plt.title(f"Distribution of '{col_name}' (Categorical)")
             plt.xlabel(col_name)
```
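Mapping `hue` to the x-variable and suppressing the legend is the idiom recent seaborn releases (0.13+) expect when colouring a single-variable count plot with a palette, since passing `palette` without `hue` is deprecated there. A standalone sketch of the same call on toy data:

```python
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

temp_series = pd.Series(["a", "b", "a", "c", "a", "b"], name="category")
order = temp_series.value_counts().index  # most frequent category first

# hue mirrors x so the palette applies per bar; the legend would be
# redundant with the x-axis labels, so it is turned off.
sns.countplot(x=temp_series, order=order, palette="Oranges",
              hue=temp_series, legend=False)
plt.tight_layout()
plt.savefig("category_counts.svg")
```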
```diff
@@ -743,12 +732,11 @@ def plot_categorical_vs_target(
     targets: List[str],
     save_dir: Union[str, Path],
     features: Optional[List[str]] = None,
-    …
-    …
-    fill_na_with: str = "Missing"
+    max_categories: int = 50,
+    fill_na_with: str = "MISSING DATA"
 ):
     """
-    Plots each categorical feature against each numeric target using box …
+    Plots each categorical feature against each numeric target using box plots.
 
     This function is a core EDA step for regression tasks to understand the
     relationship between a categorical independent variable and a continuous
```
```diff
@@ -761,7 +749,6 @@ def plot_categorical_vs_target(
         targets (List[str]): A list of numeric target column names (y-axis).
         save_dir (str | Path): The base directory where plots will be saved. A subdirectory will be created here for each target.
         features (List[str] | None): A list of categorical feature column names (x-axis). If None, all non-numeric (object) columns will be used.
-        plot_type (Literal["box", "violin"]): The type of plot to generate.
         max_categories (int): The maximum number of unique categories a feature can have to be plotted. Features exceeding this limit will be skipped.
         fill_na_with (str): A string to replace NaN values in categorical columns. This allows plotting 'missingness' as its own category. Defaults to "Missing".
 
```
```diff
@@ -771,10 +758,6 @@ def plot_categorical_vs_target(
     """
     # 1. Validate the base save directory and inputs
     base_save_path = make_fullpath(save_dir, make=True, enforce="directory")
-
-    if plot_type not in ["box", "violin"]:
-        _LOGGER.error(f"Invalid plot type '{plot_type}'")
-        raise ValueError()
 
     # 2. Validate target columns (must be numeric)
     valid_targets = []
```
```diff
@@ -797,14 +780,10 @@ def plot_categorical_vs_target(
         for col in df.columns:
             if col in valid_targets:
                 continue
-
             # Auto-include object dtypes
             if is_object_dtype(df[col]):
                 features_to_plot.append(col)
-
-            # elif is_numeric_dtype(df[col]) and df[col].nunique() <= max_categories:
-            #     _LOGGER.info(f"Treating low-cardinality numeric column '{col}' as categorical.")
-            #     features_to_plot.append(col)
+
     else:
         # Validate user-provided list
         for col in features:
```
```diff
@@ -822,12 +801,11 @@ def plot_categorical_vs_target(
 
     for target_name in valid_targets:
         # Create a sanitized subdirectory for this target
-        safe_target_dir_name = sanitize_filename(f"{target_name}…
+        safe_target_dir_name = sanitize_filename(f"{target_name}_vs_Categorical")
         target_save_dir = base_save_path / safe_target_dir_name
         target_save_dir.mkdir(parents=True, exist_ok=True)
 
-        _LOGGER.info(f"Generating …
-        …
+        _LOGGER.info(f"Generating plots for target: '{target_name}' -> Saving to '{target_save_dir.name}'")
         for feature_name in features_to_plot:
 
             # Make a temporary copy for plotting to handle NaNs and dtypes
```
```diff
@@ -849,12 +827,9 @@ def plot_categorical_vs_target(
 
             # 5. Create the plot
             # Increase figure width for categories
-            plt.figure(figsize=(max(10, n_unique * 1.2), …
+            plt.figure(figsize=(max(10, n_unique * 1.2), 10))
 
-            if plot_type == "box":
-                sns.boxplot(x=feature_name, y=target_name, data=temp_df)
-            elif plot_type == "violin":
-                sns.violinplot(x=feature_name, y=target_name, data=temp_df)
+            sns.boxplot(x=feature_name, y=target_name, data=temp_df)
 
             plt.title(f'{target_name} vs {feature_name}')
             plt.xlabel(feature_name)
```
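With the violin branch removed, `plot_type` is gone from the API and every feature/target pair gets a box plot. A usage sketch against the new signature (import path and data are illustrative):

```python
import pandas as pd
from ml_tools.data_exploration import plot_categorical_vs_target  # import path assumed

df = pd.DataFrame({
    "region": ["north", "south", "north", "south", "east"],
    "price": [10.0, 12.5, 9.8, 13.1, 11.0],
})

# One box plot per (feature, target) pair is saved under
# <save_dir>/<target>_vs_Categorical/.
plot_categorical_vs_target(
    df=df,
    targets=["price"],
    save_dir="eda_plots",
    features=["region"],          # or None to auto-detect object columns
    max_categories=50,            # new default shown in the diff
    fill_na_with="MISSING DATA",  # new default shown in the diff
)
```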
```diff
@@ -982,7 +957,7 @@ def encode_categorical_features(
 
     # Handle the dataset splitting logic
     if split_resulting_dataset:
-        df_categorical = df_encoded[valid_columns]
+        df_categorical = df_encoded[valid_columns]
 
         df_non_categorical = df.drop(columns=valid_columns)
         return mappings, df_non_categorical, df_categorical
     else:
```
{dragon_ml_toolbox-19.12.2.dist-info → dragon_ml_toolbox-19.13.0.dist-info}/WHEEL RENAMED (file without changes)
{dragon_ml_toolbox-19.12.2.dist-info → dragon_ml_toolbox-19.13.0.dist-info}/licenses/LICENSE RENAMED (file without changes)
{dragon_ml_toolbox-19.12.2.dist-info → dragon_ml_toolbox-19.13.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md RENAMED (file without changes)
{dragon_ml_toolbox-19.12.2.dist-info → dragon_ml_toolbox-19.13.0.dist-info}/top_level.txt RENAMED (file without changes)