py2ls 0.2.4.15__py3-none-any.whl → 0.2.4.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- py2ls/.git/index +0 -0
- py2ls/ips.py +722 -12
- py2ls/ml2ls copy.py +2906 -0
- py2ls/ml2ls.py +345 -12
- py2ls/plot.py +409 -24
- {py2ls-0.2.4.15.dist-info → py2ls-0.2.4.16.dist-info}/METADATA +1 -1
- {py2ls-0.2.4.15.dist-info → py2ls-0.2.4.16.dist-info}/RECORD +8 -7
- {py2ls-0.2.4.15.dist-info → py2ls-0.2.4.16.dist-info}/WHEEL +0 -0
py2ls/ips.py
CHANGED
@@ -3163,14 +3163,19 @@ def listdir(
     if kind is None:
         ls = os.listdir(rootdir)
         ls = [f for f in ls if not f.startswith(".") and not f.startswith("~")]
-
+        if verbose:
+            if len(ls)>20:
+                print(ls[:20])
+            else:
+                print(ls)
         df_all = pd.DataFrame(
             {
                 "fname": ls,
                 "fpath": [os.path.join(rootdir, i) for i in ls],
             }
         )
-
+        if verbose:
+            display(df_all.head())
         return df_all
     if isinstance(kind, list):
         f_ = []
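A minimal usage sketch of the new verbose preview (the import path and the rootdir/kind/verbose parameters are inferred from this hunk rather than from the package docs, and the display() call assumes an IPython environment):

    from py2ls.ips import listdir

    # With kind=None and verbose=True, the function now prints at most the
    # first 20 directory entries before returning the DataFrame of names and paths.
    df_files = listdir("/tmp", kind=None, verbose=True)
    print(df_files[["fname", "fpath"]].head())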
@@ -3206,6 +3211,7 @@ def listdir(
             "size": [],
             "fname": [],
             "fpath": [],
+            "basename":[],
         }
         for item in ls:
             item_path = os.path.join(rootdir, item)
@@ -3228,6 +3234,7 @@ def listdir(
                 f["length"].append(len(filename))
                 f["path"].append(os.path.join(os.path.dirname(item_path), item))
                 fpath = os.path.join(os.path.dirname(item_path), item)
+                basename=os.path.basename(item_path)
                 f["size"].append(round(os.path.getsize(fpath) / 1024 / 1024, 3))
                 f["created_time"].append(
                     pd.to_datetime(os.path.getctime(item_path), unit="s")
@@ -3240,6 +3247,7 @@ def listdir(
                 )
                 f["fname"].append(filename) # will be removed
                 f["fpath"].append(fpath) # will be removed
+                f['basename'].append(basename)
                 i += 1
 
         f["num"] = i
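The three basename hunks above initialize a "basename" field, compute os.path.basename(item_path) once per file, and append it, so the table returned by listdir gains a basename column next to fname and fpath. The added call is plain standard-library behavior:

    import os

    item_path = "/tmp/reports/summary.final.txt"
    print(os.path.basename(item_path))  # "summary.final.txt", the value stored in the new "basename" column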
@@ -3462,7 +3470,6 @@ def figsave(*args, dpi=300):
         img.save(fname, format=ftype.upper(), dpi=(dpi, dpi))
     elif isinstance(img, np.ndarray):
         import cv2
-
         # Check the shape of the image to determine color mode
         if img.ndim == 2:
             # Grayscale image
@@ -5055,16 +5062,22 @@ def _df_outlier(
     from scipy.stats import zscore
     from sklearn.ensemble import IsolationForest
     from sklearn.preprocessing import StandardScaler
-
+
+    # Fill completely NaN columns with a default value (e.g., 0)
+    data = data.copy()
+    data.loc[:, data.isna().all()] = 0
+    if columns is not None:
+        if isinstance(columns, (list,pd.core.indexes.base.Index)):
+            data=data[columns]
     col_names_org = data.columns.tolist()
     index_names_org = data.index.tolist()
     # Separate numeric and non-numeric columns
     numeric_data = data.select_dtypes(include=[np.number])
     non_numeric_data = data.select_dtypes(exclude=[np.number])
 
-    if columns is not None:
-        numeric_data = numeric_data[columns]
-    if numeric_data.empty:
+    # if columns is not None:
+    #     numeric_data = numeric_data[columns]
+    if numeric_data.empty:
         raise ValueError("Input data must contain numeric columns.")
 
     outliers_df = pd.DataFrame(index=numeric_data.index)
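The hunk above makes _df_outlier tolerant of all-NaN columns (they are zero-filled up front) and lets a columns argument restrict the frame before outlier detection. The zero-fill idiom on its own, in plain pandas:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [np.nan, np.nan, np.nan]})
    df = df.copy()
    df.loc[:, df.isna().all()] = 0  # only column "b" is entirely NaN, so only "b" becomes 0
    print(df)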
@@ -5626,6 +5639,10 @@ def df_fillna(
     for col in data.columns:
         data[col] = data[col].apply(lambda x: np.nan if x is None else x)
 
+    # Fill completely NaN columns with a default value (e.g., 0)
+    data = data.copy()
+    data.loc[:, data.isna().all()] = 0
+
     col_names_org = data.columns.tolist()
     index_names_org = data.index.tolist()
     # Separate numeric and non-numeric columns
@@ -5682,11 +5699,11 @@ def df_fillna(
             imputed_data = imputer.fit_transform(numeric_data.T)
         else:
             raise ValueError("Invalid axis. Use 0 for columns or 1 for rows.")
-
+
         imputed_data = pd.DataFrame(
             imputed_data if axis == 0 else imputed_data.T,
-            index=numeric_data.index if axis == 0 else
-            columns=numeric_data.columns if axis == 0 else
+            index=numeric_data.index if axis == 0 else numeric_data.columns,
+            columns=numeric_data.columns if axis == 0 else numeric_data.index,
         )
         for col in imputed_data.select_dtypes(include=[np.number]).columns:
             imputed_data[col] = imputed_data[col].astype(numeric_data[col].dtype)
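This hunk completes the previously truncated ternaries: in the non-default axis branch the rebuilt DataFrame now receives numeric_data.columns as its index labels and numeric_data.index as its column labels. A standalone reminder of why transposition swaps the two label sets (plain pandas, independent of the py2ls imputer wiring):

    import pandas as pd

    df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], index=["r1", "r2"], columns=["c1", "c2", "c3"])
    print(df.T.index.tolist())    # ['c1', 'c2', 'c3']  (former columns become the index)
    print(df.T.columns.tolist())  # ['r1', 'r2']        (former index becomes the columns)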
@@ -5826,8 +5843,13 @@ def df_encoder(
         from sklearn.preprocessing import LabelEncoder
 
         encoder = LabelEncoder()
-
-
+        # Apply LabelEncoder only to non-numeric columns
+        non_numeric_columns = [col for col in columns if not pd.api.types.is_numeric_dtype(data[col])]
+
+        if not non_numeric_columns:
+            return data
+        encoded_data = data[non_numeric_columns].apply(lambda col: encoder.fit_transform(col))
+        return pd.concat([data.drop(non_numeric_columns, axis=1), encoded_data], axis=1)
 
     # Target encoding (Mean of the target for each category)
     elif method == "target":
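The label branch of df_encoder now encodes only the non-numeric columns and concatenates them back onto the untouched numeric ones. The same idiom in plain pandas/scikit-learn (a standalone sketch, not the py2ls wrapper itself):

    import pandas as pd
    from sklearn.preprocessing import LabelEncoder

    data = pd.DataFrame({"city": ["Berlin", "Paris", "Berlin"], "age": [30, 25, 40]})
    columns = ["city", "age"]
    encoder = LabelEncoder()
    non_numeric = [c for c in columns if not pd.api.types.is_numeric_dtype(data[c])]
    encoded = data[non_numeric].apply(lambda col: encoder.fit_transform(col))
    # "city" is label-encoded (Berlin=0, Paris=1); "age" passes through untouched
    print(pd.concat([data.drop(non_numeric, axis=1), encoded], axis=1))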
@@ -6878,7 +6900,188 @@ def df_reducer(
 # example:
 # df_reducer(data=data_log, columns=markers, n_components=2)
 
+def df_format(data, threshold_unique=0.5, verbose=False):
+    """
+    Detect table format: long, wide or uncertain.
+
+    Parameters:
+    - data (pd.DataFrame): DataFrame to check.
+    - threshold_unique (float): Proportion threshold for detecting categorical columns.
+
+    Returns:
+    - "long" if detected as long format,
+    - "wide" if detected as wide format
+    - "uncertain" if ambiguous.
+    """
+    from scipy.stats import entropy
+    from sklearn.cluster import AgglomerativeClustering
+    from sklearn.preprocessing import StandardScaler
+
+    long_score = 0
+    wide_score = 0
+
+    n_rows, n_cols = data.shape
+
+    # Step 1: Row-Column Ratio Heuristic
+    if n_rows > 3 * n_cols:
+        long_score += 2
+        if verbose:
+            print(
+                "Row-Column Ratio suggests long format (many rows relative to columns)."
+            )
+    elif n_cols > 3 * n_rows:
+        wide_score += 2
+        if verbose:
+            print(
+                "Row-Column Ratio suggests wide format (many columns relative to rows)."
+            )
+
+    # Step 2: Unique-to-duplicate ratio and entropy for categorical variables
+    unique_counts = data.apply(lambda x: x.nunique())
+    duplicate_ratio = 1 - unique_counts / n_rows
+    if (duplicate_ratio > 0.2).sum() > 0.5 * n_cols:
+        wide_score += 2
+        if verbose:
+            print("High duplicate values in columns suggest wide format.")
+    else:
+        long_score += 1
+        if verbose:
+            print(
+                "Lower duplicate ratio suggests long format (higher row variability)."
+            )
 
+    # Calculate entropy for categorical columns
+    categorical_cols = data.select_dtypes(include=["object", "category"]).columns
+    if len(categorical_cols) > 0:
+        for col in categorical_cols:
+            counts = data[col].value_counts(normalize=True)
+            col_entropy = entropy(counts)
+            if col_entropy < 1.5:
+                long_score += 1
+                if verbose:
+                    print(
+                        f"Column '{col}' entropy suggests categorical, supporting long format."
+                    )
+            else:
+                wide_score += 1
+                if verbose:
+                    print(f"Column '{col}' entropy is higher, supporting wide format.")
+
+    # Step 3: Column grouping analysis for patterns in suffixes/prefixes
+    col_names = data.columns.astype(str)
+    suffix_count = sum("_" in col or col[-1].isdigit() for col in col_names)
+    if suffix_count > 0.3 * n_cols:
+        wide_score += 2
+        if verbose:
+            print(
+                "Detected suffix/prefix patterns in column names, suggesting wide format."
+            )
+
+    # Step 4: Entity identifier detection for long format with categorical columns
+    if len(categorical_cols) > 0 and n_rows > n_cols:
+        entity_identifier_count = sum(
+            data.duplicated(subset=categorical_cols, keep=False)
+        )
+        if entity_identifier_count > 0.2 * n_rows:
+            long_score += 2
+            if verbose:
+                print(
+                    "Significant duplicate rows based on categorical columns, suggesting long format."
+                )
+
+    # Step 5: Clustering analysis on numerical columns for correlation in wide format
+    numeric_cols = data.select_dtypes(include="number").columns
+    if len(numeric_cols) > 1:
+        scaled_data = StandardScaler().fit_transform(data[numeric_cols].dropna())
+        clustering = AgglomerativeClustering(n_clusters=2).fit(scaled_data.T)
+        cluster_labels = pd.Series(clustering.labels_)
+        if cluster_labels.nunique() < len(numeric_cols) * 0.5:
+            wide_score += 2
+            if verbose:
+                print("Clustering on columns shows grouping, suggesting wide format.")
+
+    # Step 6: Inter-column correlation analysis
+    if len(numeric_cols) > 1:
+        corr_matrix = data[numeric_cols].corr().abs()
+        avg_corr = (
+            corr_matrix.where(~np.eye(len(corr_matrix), dtype=bool)).mean().mean()
+        )
+        if avg_corr > 0.6:
+            wide_score += 2
+            if verbose:
+                print("High inter-column correlation suggests wide format.")
+
+    # Step 7: Missing value pattern analysis
+    missing_patterns = data.isna().sum(axis=1)
+    if missing_patterns.std() < 2:
+        wide_score += 1
+        if verbose:
+            print(
+                "Low variation in missing patterns across rows, supporting wide format."
+            )
+    elif missing_patterns.mean() < 1:
+        long_score += 1
+        if verbose:
+            print("Lower missing pattern suggests long format (less structured).")
+
+    # Step 8: Multi-level clustering on rows to detect block structure for wide format
+    if len(numeric_cols) > 1 and n_rows > 5:
+        clustering_rows = AgglomerativeClustering(n_clusters=2).fit(scaled_data)
+        if pd.Series(clustering_rows.labels_).nunique() < 2:
+            wide_score += 2
+            if verbose:
+                print("Row clustering reveals homogeneity, suggesting wide format.")
+
+    # Step 9: Sequential name detection for time-series pattern in wide format
+    if any(col.isdigit() or col.startswith("T") for col in col_names):
+        wide_score += 1
+        if verbose:
+            print("Detected time-like sequential column names, supporting wide format.")
+
+    # Step 10: Entropy of numeric columns
+    numeric_entropy = data[numeric_cols].apply(
+        lambda x: entropy(pd.cut(x, bins=10).value_counts(normalize=True))
+    )
+    if numeric_entropy.mean() < 2:
+        wide_score += 2
+        if verbose:
+            print(
+                "Low entropy in numeric columns indicates stability across columns, supporting wide format."
+            )
+
+    # Step 11: Tie-breaking strategy if scores are equal
+    if wide_score == long_score:
+        if n_cols > n_rows:
+            wide_score += 1
+            if verbose:
+                print(
+                    "Tie-breaking based on column-major structure, favoring wide format."
+                )
+        elif n_rows > n_cols:
+            long_score += 1
+            if verbose:
+                print(
+                    "Tie-breaking based on row-major structure, favoring long format."
+                )
+        else:
+            if verbose:
+                print("Tie-breaking inconclusive; returning 'uncertain'.")
+            return "uncertain"
+
+    # Final decision
+    if wide_score > long_score:
+        if verbose:
+            print("Final decision: Wide format.")
+        return "wide"
+    elif long_score > wide_score:
+        if verbose:
+            print("Final decision: Long format.")
+        return "long"
+    else:
+        if verbose:
+            print("Final decision: Uncertain format.")
+        return "uncertain"
+
 def plot_cluster(
     data: pd.DataFrame,
     labels: np.ndarray,
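The new df_format helper scores a set of shape, duplication, entropy, clustering and naming heuristics and returns "long", "wide" or "uncertain". A usage sketch, assuming the function is importable from py2ls.ips; the printed verdict is only what the heuristics would be expected to favor, not guaranteed:

    import numpy as np
    import pandas as pd
    from py2ls.ips import df_format  # assumed import path for the new helper

    # Many rows, a repeated categorical identifier, one value column: a "long"-looking table.
    long_df = pd.DataFrame({
        "subject": np.repeat(["s1", "s2", "s3"], 40),
        "score": np.random.default_rng(0).normal(size=120),
    })
    print(df_format(long_df, verbose=True))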
@@ -7126,7 +7329,514 @@ def evaluate_cluster(
     metrics["V-Measure"] = np.nan
 
     return metrics
+def df_qc(
+    data: pd.DataFrame,
+    columns=None,
+    verbose=False,
+    plot_=True,
+    max_cols=20, # only for plots
+    output=False,
+):
+    """
+    Usage example:
+    df = pd.DataFrame(...) # Your DataFrameres_qc = df_qc(df)
+    """
+    from statsmodels.stats.outliers_influence import variance_inflation_factor
+    from scipy.stats import skew, kurtosis, entropy
+    import skimpy
+
+    #! display(data.select_dtypes(include=[np.number]).describe())
+    #!skim
+    if columns is not None:
+        if isinstance(columns, (list,pd.core.indexes.base.Index)):
+            data=data[columns]
+    try:
+        skimpy.skim(data)
+    except:
+        numerical_data = data.select_dtypes(include=[np.number])
+        skimpy.skim(numerical_data)
+    # Fill completely NaN columns with a default value (e.g., 0)
+    data = data.copy()
+    data.loc[:, data.isna().all()] = 0
+    res_qc = {}
+
+    # Missing values
+    res_qc["missing_values"] = data.isnull().sum()
+    res_qc["missing_percentage"] = (res_qc["missing_values"] / len(data)) * 100
+    res_qc["rows_with_missing"] = data.isnull().any(axis=1).sum()
+
+    # Data types and unique values
+    res_qc["data_types"] = data.dtypes
+    res_qc["unique_values"] = data.nunique()
+    res_qc["constant_columns"] = [
+        col for col in data.columns if data[col].nunique() <= 1
+    ]
+
+    # Duplicate rows and columns
+    res_qc["duplicate_rows"] = data.duplicated().sum()
+    res_qc["duplicate_columns"] = data.columns[data.columns.duplicated()].tolist()
+
+    # Empty columns
+    res_qc["empty_columns"] = [col for col in data.columns if data[col].isnull().all()]
+
+    # outliers
+    data_outliers = df_outlier(data)
+    outlier_num = data_outliers.isna().sum() - data.isnull().sum()
+    res_qc["outlier_num"] = outlier_num[outlier_num > 0]
+    outlier_percentage=(outlier_num / len(data_outliers)) * 100
+    res_qc["outlier_percentage"] = outlier_percentage[outlier_percentage>0]
+    # Correlation and multicollinearity (VIF)
+    if any(data.dtypes.apply(pd.api.types.is_numeric_dtype)):
+        numeric_df = data.select_dtypes(include=[np.number]).dropna()
+        corr_matrix = numeric_df.corr()
+        high_corr_pairs = [
+            (col1, col2)
+            for col1 in corr_matrix.columns
+            for col2 in corr_matrix.columns
+            if col1 != col2 and abs(corr_matrix[col1][col2]) > 0.9
+        ]
+        res_qc["high_correlations"] = high_corr_pairs
+
+    # VIF for multicollinearity check
+    numeric_df = data.select_dtypes(include=[np.number]).dropna()
+    vif_data = pd.DataFrame()
+    res_qc["vif"]=vif_data
+    if numeric_df.shape[1] > 1:
+        vif_data["feature"] = numeric_df.columns
+        vif_data["VIF"] = [
+            variance_inflation_factor(numeric_df.values, i)
+            for i in range(numeric_df.shape[1])
+        ]
+        res_qc["vif"] = vif_data[
+            vif_data["VIF"] > 5
+        ] # Typically VIF > 5 indicates multicollinearity
+    # Skewness and Kurtosis
+    skewness = data.skew(numeric_only=True)
+    kurtosis_vals = data.kurt(numeric_only=True)
+    res_qc["skewness"] = skewness[abs(skewness) > 1]
+    res_qc["kurtosis"] = kurtosis_vals[abs(kurtosis_vals) > 3]
+
+    # Entropy for categorical columns (higher entropy suggests more disorder)
+    categorical_cols = data.select_dtypes(include=["object", "category"]).columns
+    res_qc["entropy_categoricals"] = {
+        col: entropy(data[col].value_counts(normalize=True), base=2)
+        for col in categorical_cols
+    }
+    # number of unique
+    res_qc["unique_counts"] = data.nunique()
+    # dtypes counts
+    res_qc['dtype_counts']=data.dtypes.value_counts()
+
+    # Distribution Analysis (mean, median, mode, std dev, IQR for numeric columns)
+    distribution_stats = data.select_dtypes(include=[np.number]).describe().T
+    iqr = data.select_dtypes(include=[np.number]).apply(
+        lambda x: x.quantile(0.75) - x.quantile(0.25)
+    )
+    distribution_stats["IQR"] = iqr
+    res_qc["distribution_analysis"] = distribution_stats
+
+    # Variance Check: Identify low-variance columns
+    variance_threshold = 0.01
+    low_variance_cols = [
+        col
+        for col in data.select_dtypes(include=[np.number]).columns
+        if data[col].var() < variance_threshold
+    ]
+    res_qc["low_variance_features"] = low_variance_cols
+
+    # Categorical columns and cardinality
+    categorical_cols = data.select_dtypes(include=["object", "category"]).columns
+    high_cardinality = {
+        col: data[col].nunique() for col in categorical_cols if data[col].nunique() > 50
+    }
+    res_qc["high_cardinality_categoricals"] = high_cardinality
+
+    # Feature-type inconsistency (mixed types in columns)
+    inconsistent_types = {}
+    for col in data.columns:
+        unique_types = set(type(val) for val in data[col].dropna())
+        if len(unique_types) > 1:
+            inconsistent_types[col] = unique_types
+    res_qc["inconsistent_types"] = inconsistent_types
+
+
+    # Text length analysis for text fields
+    text_lengths = {}
+    for col in categorical_cols:
+        text_lengths[col] = {
+            "avg_length": data[col].dropna().apply(len).mean(),
+            "length_variance": data[col].dropna().apply(len).var(),
+        }
+    res_qc["text_length_analysis"] = text_lengths
+
+    # Summary statistics
+    res_qc["summary_statistics"] = data.describe().T
+
+    # Automated warnings
+    warnings = []
+    if res_qc["duplicate_rows"] > 0:
+        warnings.append("Warning: Duplicate rows detected.")
+    if len(res_qc["empty_columns"]) > 0:
+        warnings.append("Warning: Columns with only NaN values detected.")
+    if len(res_qc["constant_columns"]) > 0:
+        warnings.append("Warning: Columns with a single constant value detected.")
+    if len(high_corr_pairs) > 0:
+        warnings.append("Warning: Highly correlated columns detected.")
+    if len(res_qc["vif"]) > 0:
+        warnings.append("Warning: Multicollinearity detected in features.")
+    if len(high_cardinality) > 0:
+        warnings.append("Warning: High cardinality in categorical columns.")
+    if len(inconsistent_types) > 0:
+        warnings.append("Warning: Columns with mixed data types detected.")
+    res_qc["warnings"] = warnings
+
+    # Report generation
+    if verbose:
+        print("=== QC Report Summary ===")
+        print("\nMissing Values (Total and %):")
+        print(res_qc["missing_values"][res_qc["missing_values"] > 0])
+        print(res_qc["missing_percentage"][res_qc["missing_percentage"] > 0])
+
+        print("\nRows with Missing Values:", res_qc["rows_with_missing"])
+
+        print("\nData Types:")
+        print(res_qc["data_types"])
+
+        print("\nUnique Values per Column:")
+        print(res_qc["unique_values"])
+
+        print("\nConstant Columns:", res_qc["constant_columns"])
+
+        print("\nDuplicate Rows:", res_qc["duplicate_rows"])
+        print("Duplicate Columns:", res_qc["duplicate_columns"])
+
+        if res_qc["empty_columns"]:
+            print("\nEmpty Columns:", res_qc["empty_columns"])
+
+        print("\nOutlier Report:")
+        print(res_qc["outlier_num"])
+        print("\nPercentage of Values Replaced per Column:")
+        print(res_qc["outlier_percentage"])
+
+        print("\nHigh Correlations (>|0.9|):")
+        for col1, col2 in res_qc["high_correlations"]:
+            print(f" {col1} and {col2}")
+
+        if "vif" in res_qc:
+            print("\nFeatures with High VIF (>|5|):")
+            print(res_qc["vif"])
+
+        print("\nHigh Cardinality Categorical Columns (>|50 unique|):")
+        print(res_qc["high_cardinality_categoricals"])
+
+        print("\nInconsistent Data Types:")
+        print(res_qc["inconsistent_types"])
+
+        print("\nRange Checks for Numeric Columns:")
+        print(res_qc["range_checks"])
+
+        print("\nText Length Analysis:")
+        for col, stats in res_qc["text_length_analysis"].items():
+            print(
+                f"{col}: Avg Length={stats['avg_length']}, Length Variance={stats['length_variance']}"
+            )
+
+        print("\nSummary Statistics:")
+        print(res_qc["summary_statistics"])
 
+        if res_qc["warnings"]:
+            print("\nWarnings:")
+            for warning in res_qc["warnings"]:
+                print(" -", warning)
+    if plot_:
+        df_qc_plots(data=data, res_qc=res_qc, max_cols=20)
+    if output:
+        return res_qc
+    return None
+
+
+def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20):
+    import matplotlib.pyplot as plt
+    import seaborn as sns
+    from .plot import subplot, figsets, get_color
+
+    if columns is not None:
+        if isinstance(columns, (list,pd.core.indexes.base.Index)):
+            data=data[columns]
+    len_total = len(res_qc)
+    n_row, n_col = int((len_total + 10) / 3), 3
+    nexttile = subplot(n_row, n_col, figsize=[5 * n_col, 5 * n_row],verbose=False)
+
+    missing_data = res_qc["missing_values"][res_qc["missing_values"] > 0].sort_values(
+        ascending=False
+    )
+    if len(missing_data) > max_cols:
+        missing_data = missing_data[:max_cols]
+    ax=sns.barplot(
+        x=missing_data.index,
+        y=missing_data.values,
+        hue=missing_data.index,
+        palette=get_color(len(missing_data), cmap="Blues")[::-1],
+        ax=nexttile(),
+    )
+    figsets(xangle=45, title="Missing (#)", ylabel="#",ax=ax)
+
+    ax2 = ax.twinx()
+    # Plot missing value percentages
+    missing_percentage = res_qc["missing_percentage"][
+        res_qc["missing_percentage"] > 0
+    ].sort_values(ascending=False)
+    sns.barplot(
+        x=missing_percentage.index,
+        y=missing_percentage.values,
+        hue=missing_percentage.index,
+        palette=get_color(len(missing_percentage), cmap="Blues")[::-1],
+        ax=ax2,#nexttile(),
+    )
+    figsets(xangle=45, ylabel="%",ax=ax2)
+    ax2.tick_params(axis="y", color='r',labelcolor='r')
+    ax2.yaxis.label.set_color('r')
+
+    outlier_num = res_qc["outlier_num"].sort_values(ascending=False)
+    if len(outlier_num) > max_cols:
+        outlier_num = outlier_num[:max_cols]
+    ax_outlier_num=sns.barplot(
+        x=outlier_num.index,
+        y=outlier_num.values,
+        hue=outlier_num.index,
+        palette=get_color(len(outlier_num), cmap="coolwarm")[::-1],
+        ax=nexttile(),
+    )
+    figsets(xangle=45, title="Outliers (#)", ylabel="#",xlabel=None)
+    ax_outlier_percentage = ax_outlier_num.twinx()
+    outlier_percentage = res_qc["outlier_percentage"].sort_values(ascending=False)
+    if len(outlier_percentage) > max_cols:
+        outlier_percentage = outlier_percentage[:max_cols]
+    ax_outlier_percentage=sns.barplot(
+        x=outlier_percentage.index,
+        y=outlier_percentage.values,
+        hue=outlier_percentage.index,
+        palette=get_color(len(outlier_percentage), cmap="coolwarm")[::-1],
+        ax=ax2 #nexttile(),
+    )
+    figsets(
+        xangle=45,
+        ylabel="%",
+        xlabel=None,
+        ylim=[0, outlier_percentage.max() + 2],
+        ax=ax_outlier_percentage
+    )
+    ax2.tick_params(axis="y", color='r',labelcolor='r')
+    ax2.yaxis.label.set_color('r')
+
+    # Skewness and Kurtosis Plots
+    skewness = res_qc["skewness"].sort_values(ascending=False)
+    kurtosis = res_qc["kurtosis"].sort_values(ascending=False)
+    if not skewness.empty:
+        ax_skewness=sns.barplot(
+            x=skewness.index,
+            y=skewness.values,
+            hue=skewness.index,
+            palette=get_color(len(skewness), cmap="coolwarm")[::-1],
+            ax=nexttile(),
+        )
+        figsets(
+            xangle=45,
+            title="Highly Skewed Numeric Columns (Skewness > 1)",
+            ylabel="Skewness",xlabel=None,ax=ax_skewness
+        )
+    if not kurtosis.empty:
+        ax_kurtosis=sns.barplot(
+            x=kurtosis.index,
+            y=kurtosis.values,
+            hue=kurtosis.index,
+            palette=get_color(len(kurtosis), cmap="coolwarm")[::-1],
+            ax=nexttile(),
+        )
+        figsets(
+            xangle=45,
+            title="Highly Kurtotic Numeric Columns (Kurtosis > 3)",
+            ylabel="Kurtosis",xlabel=None,ax=ax_kurtosis
+        )
+
+    # Entropy for Categorical Variables
+    entropy_data = pd.Series(res_qc["entropy_categoricals"]).sort_values(
+        ascending=False
+    )
+    ax_entropy_data=sns.barplot(
+        x=entropy_data.index, y=entropy_data.values,hue=entropy_data.index, palette="viridis", ax=nexttile()
+    )
+    figsets(
+        xangle=45,
+        xlabel="Categorical Columns",
+        title="Entropy of Categorical Variables",
+        ylabel="Entropy (bits)",
+        ax=ax_entropy_data
+    )
+    # Distribution Analysis: Boxplot for IQR
+    ax_iqr=sns.boxplot(
+        data=data[res_qc["distribution_analysis"].index],
+        orient="v",
+        palette="Set3",
+        ax=nexttile(),
+    )
+    figsets(
+        xangle=45,
+        title="Range for Numeric Columns",
+        ylabel="#",
+        ax=ax_iqr
+    )
+    # unique counts
+    unique_counts=res_qc["unique_counts"].sort_values(ascending=False)
+    ax_unique_counts_=sns.barplot(
+        x=unique_counts.index,
+        y=unique_counts.values,
+        hue=unique_counts.index,
+        palette=get_color(len(unique_counts)+10, cmap="Blues")[::-1],
+        ax=nexttile())
+    figsets(
+        xangle=45,
+        title="Unique Counts",
+        xlabel=None,
+        ylabel="#",
+        ax=ax_unique_counts_
+    )
+    # Binary Checking
+    ax_unique_counts=sns.barplot(x=unique_counts[unique_counts<10].index,
+        y=unique_counts[unique_counts<10].values,
+        hue=unique_counts[unique_counts<10].index,
+        palette=get_color(len(unique_counts[unique_counts<10])+10, cmap="Blues")[::-1],
+        ax=nexttile())
+    plt.axhline(y=2, color="r", linestyle="--", lw=2)
+    figsets(
+        xangle=45,
+        xlabel=None,
+        title="Binary Checking",
+        ylabel="#",
+        ax=ax_unique_counts
+    )
+
+    # dtypes counts
+    dtype_counts = res_qc['dtype_counts']
+    txt = []
+    for tp in dtype_counts.index:
+        txt.append(list(data.select_dtypes(include=tp).columns))
+
+    ax_dtype_counts = sns.barplot(
+        x=dtype_counts.index,
+        y=dtype_counts.values,
+        color="#F3C8B2",
+        ax=nexttile(),
+    )
+    max_columns_per_row = 1 # Maximum number of columns per row
+    for i, tp in enumerate(dtype_counts.index):
+        if i<=20:
+            column_names = txt[i]
+            # Split the column names into multiple lines if too long
+            column_name_str = ", ".join(column_names)
+            if len(column_name_str) > 40: # If column names are too long, split them
+                column_name_str = "\n".join(
+                    [
+                        ", ".join(column_names[j : j + max_columns_per_row])
+                        for j in range(0, len(column_names), max_columns_per_row)
+                    ]
+                )
+            # Place text annotation with line breaks and rotate the text if needed
+            ax_dtype_counts.text(
+                i,
+                dtype_counts.values[i],
+                f"{column_name_str}",
+                ha="center",
+                va="top",
+                c="k",
+                fontsize=8,
+                rotation=0,
+            )
+    figsets(
+        xlabel=None,
+        title="Dtypes",
+        ylabel="#",
+        ax=ax_dtype_counts
+    )
+
+    # High cardinality: Show top categorical columns by unique value count
+    high_cardinality = res_qc["high_cardinality_categoricals"]
+    if high_cardinality and len(high_cardinality) > max_cols:
+        high_cardinality = dict(
+            sorted(high_cardinality.items(), key=lambda x: x[1], reverse=True)[
+                :max_cols
+            ]
+        )
+
+    if high_cardinality:
+        ax_high_cardinality=sns.barplot(
+            x=list(high_cardinality.keys()),
+            y=list(high_cardinality.values()),
+            hue=list(high_cardinality.keys()),
+            palette="Oranges", ax=nexttile()
+        )
+        figsets(
+            xangle=45,
+            title="High Cardinality Categorical Columns",
+            ylabel="Unique Value Count",
+            ax=ax_high_cardinality
+        )
+    if res_qc["low_variance_features"]:
+        low_variance_data = data[res_qc["low_variance_features"]].copy()
+        for col in low_variance_data.columns:
+            sns.histplot(
+                low_variance_data[col], bins=20, kde=True, color="coral", ax=nexttile()
+            )
+            plt.title(f"Low Variance Feature: {col}")
+
+    # VIF plot for multicollinearity detection
+    if "vif" in res_qc and not res_qc["vif"].empty:
+        vif_data = res_qc["vif"].sort_values(by="VIF", ascending=False)
+        if len(vif_data) > max_cols:
+            vif_data = vif_data[:max_cols]
+        ax_vif=sns.barplot(data=vif_data,
+            x="VIF",
+            y="feature",
+            hue="VIF",
+            palette=get_color(len(vif_data)+10, cmap="Blues")[::-1],
+            ax=nexttile())
+        figsets(
+            xangle=45,
+            title="Variance Inflation Factor(VIF)",
+            xlabel="Variance Inflation Factor(VIF)",
+            ylabel="Features",
+            legend=None,
+            ax=ax_vif
+        )
+
+    # Correlation heatmap for numeric columns with high correlation pairs
+    if any(data.dtypes.apply(pd.api.types.is_numeric_dtype)):
+        corr = data.select_dtypes(include=[np.number]).dropna().corr()
+        if corr.shape[1]<=33:
+            mask = np.triu(np.ones_like(corr, dtype=bool))
+            # Dynamically scale fontsize based on the number of columns
+            num_columns = corr.shape[1]
+            fontsize = max(6, min(12, 12 - (num_columns - 10) * 0.2)) # Scale between 8 and 12
+
+            ax_heatmap=sns.heatmap(
+                corr,
+                mask=mask,
+                annot=True,
+                cmap="coolwarm",
+                center=0,
+                fmt=".2f",
+                linewidths=0.5,
+                vmin=-1, vmax=1,
+                ax=nexttile(2, 2),
+                cbar_kws=dict(shrink=0.2,ticks=np.arange(-1, 2, 1)),
+                annot_kws={"size": fontsize}
+            )
+
+            figsets(
+                xangle=45,
+                title="Correlation Heatmap",
+                ax=ax_heatmap
+            )
 
 def use_pd(
     func_name="excel",
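The new df_qc/df_qc_plots pair aggregates missing-value, duplicate, outlier, correlation/VIF, skewness, entropy and cardinality checks into a single report dict and, optionally, a tiled plot dashboard. A usage sketch, assuming the function is importable from py2ls.ips and that the optional dependencies (skimpy, statsmodels) are installed; plotting is disabled so the sketch stays headless:

    import numpy as np
    import pandas as pd
    from py2ls.ips import df_qc  # assumed import path

    df = pd.DataFrame({
        "a": [1.0, 2.0, np.nan, 4.0],
        "b": [1.0, 1.0, 1.0, 1.0],   # constant column, expected to trigger a warning
        "c": ["x", "y", "x", "z"],
    })
    res_qc = df_qc(df, plot_=False, output=True)  # output=True returns the report dict
    print(res_qc["warnings"])
    print(res_qc["missing_percentage"])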