py2ls 0.2.4.15__py3-none-any.whl → 0.2.4.17__py3-none-any.whl
- py2ls/.git/index +0 -0
- py2ls/ips.py +724 -12
- py2ls/ml2ls copy.py +2906 -0
- py2ls/ml2ls.py +411 -16
- py2ls/plot.py +409 -24
- {py2ls-0.2.4.15.dist-info → py2ls-0.2.4.17.dist-info}/METADATA +2 -1
- {py2ls-0.2.4.15.dist-info → py2ls-0.2.4.17.dist-info}/RECORD +8 -7
- {py2ls-0.2.4.15.dist-info → py2ls-0.2.4.17.dist-info}/WHEEL +0 -0
py2ls/ips.py
CHANGED
@@ -2171,6 +2171,8 @@ def fload(fpath, kind=None, **kwargs):
                     continue
                 else:
                     pass
+        if is_df_abnormal(df,verbose=verbose):
+            df=pd.read_csv(fpath,**kwargs)
         display(df.head(2))
         print(f"shape: {df.shape}")
         return df
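The new branch gives fload a second chance when its delimiter-sniffing pass produces a malformed frame: it simply re-reads the file with pd.read_csv(fpath, **kwargs). A minimal calling sketch (the file name is hypothetical; fload and is_df_abnormal are the functions shown in this diff):

    from py2ls import ips

    # If the first parse looks abnormal (e.g. every field collapsed into one
    # column), fload now retries with pandas' default CSV reader.
    df = ips.fload("measurements.csv", kind="csv")
    print(df.shape)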
@@ -3163,14 +3165,19 @@ def listdir(
     if kind is None:
         ls = os.listdir(rootdir)
         ls = [f for f in ls if not f.startswith(".") and not f.startswith("~")]
-
+        if verbose:
+            if len(ls)>20:
+                print(ls[:20])
+            else:
+                print(ls)
         df_all = pd.DataFrame(
             {
                 "fname": ls,
                 "fpath": [os.path.join(rootdir, i) for i in ls],
             }
         )
-
+        if verbose:
+            display(df_all.head())
         return df_all
     if isinstance(kind, list):
         f_ = []
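listdir now previews its results when asked: up to 20 file names plus the head of the assembled frame. A usage sketch, assuming verbose is (or is forwarded as) a keyword of listdir (its signature sits outside this hunk):

    from py2ls import ips

    # prints at most 20 entries and df_all.head() before returning the frame
    df_all = ips.listdir(".", verbose=True)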
@@ -3206,6 +3213,7 @@ def listdir(
             "size": [],
             "fname": [],
             "fpath": [],
+            "basename":[],
         }
         for item in ls:
             item_path = os.path.join(rootdir, item)
@@ -3228,6 +3236,7 @@ def listdir(
                 f["length"].append(len(filename))
                 f["path"].append(os.path.join(os.path.dirname(item_path), item))
                 fpath = os.path.join(os.path.dirname(item_path), item)
+                basename=os.path.basename(item_path)
                 f["size"].append(round(os.path.getsize(fpath) / 1024 / 1024, 3))
                 f["created_time"].append(
                     pd.to_datetime(os.path.getctime(item_path), unit="s")
@@ -3240,6 +3249,7 @@ def listdir(
                 )
                 f["fname"].append(filename)  # will be removed
                 f["fpath"].append(fpath)  # will be removed
+                f['basename'].append(basename)
                 i += 1

     f["num"] = i
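The three listdir hunks above record one more piece of metadata per file, its base name. A tiny sketch of what the new "basename" column holds (the path is hypothetical):

    import os

    item_path = "/data/project/results_2024.csv"   # hypothetical entry
    basename = os.path.basename(item_path)         # -> "results_2024.csv"
    # listdir() now stores this next to "fname" and "fpath" in the returned frame.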
@@ -3462,7 +3472,6 @@ def figsave(*args, dpi=300):
         img.save(fname, format=ftype.upper(), dpi=(dpi, dpi))
     elif isinstance(img, np.ndarray):
         import cv2
-
         # Check the shape of the image to determine color mode
         if img.ndim == 2:
             # Grayscale image
@@ -5055,16 +5064,22 @@ def _df_outlier(
     from scipy.stats import zscore
     from sklearn.ensemble import IsolationForest
     from sklearn.preprocessing import StandardScaler
-
+
+    # Fill completely NaN columns with a default value (e.g., 0)
+    data = data.copy()
+    data.loc[:, data.isna().all()] = 0
+    if columns is not None:
+        if isinstance(columns, (list,pd.core.indexes.base.Index)):
+            data=data[columns]
     col_names_org = data.columns.tolist()
     index_names_org = data.index.tolist()
     # Separate numeric and non-numeric columns
     numeric_data = data.select_dtypes(include=[np.number])
     non_numeric_data = data.select_dtypes(exclude=[np.number])

-    if columns is not None:
-
-
+    # if columns is not None:
+    # numeric_data = numeric_data[columns]
     if numeric_data.empty:
         raise ValueError("Input data must contain numeric columns.")

     outliers_df = pd.DataFrame(index=numeric_data.index)
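The added pre-processing zero-fills all-NaN columns and optionally restricts _df_outlier to a caller-supplied column subset before splitting numeric from non-numeric data. A standalone sketch of that pattern on a toy frame (not package code):

    import numpy as np
    import pandas as pd

    data = pd.DataFrame({"a": [1.0, 2.0, 30.0], "b": [np.nan] * 3, "c": list("xyz")})
    columns = ["a", "b"]

    data = data.copy()
    data.loc[:, data.isna().all()] = 0          # all-NaN column "b" becomes 0
    if columns is not None and isinstance(columns, (list, pd.Index)):
        data = data[columns]                    # keep only the requested subset
    numeric_data = data.select_dtypes(include=[np.number])
    print(numeric_data.columns.tolist())        # ['a', 'b']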
@@ -5626,6 +5641,10 @@ def df_fillna(
     for col in data.columns:
         data[col] = data[col].apply(lambda x: np.nan if x is None else x)

+    # Fill completely NaN columns with a default value (e.g., 0)
+    data = data.copy()
+    data.loc[:, data.isna().all()] = 0
+
     col_names_org = data.columns.tolist()
     index_names_org = data.index.tolist()
     # Separate numeric and non-numeric columns
@@ -5682,11 +5701,11 @@ def df_fillna(
         imputed_data = imputer.fit_transform(numeric_data.T)
     else:
         raise ValueError("Invalid axis. Use 0 for columns or 1 for rows.")
-
+
     imputed_data = pd.DataFrame(
         imputed_data if axis == 0 else imputed_data.T,
-        index=numeric_data.index if axis == 0 else
-        columns=numeric_data.columns if axis == 0 else
+        index=numeric_data.index if axis == 0 else numeric_data.columns,
+        columns=numeric_data.columns if axis == 0 else numeric_data.index,
     )
     for col in imputed_data.select_dtypes(include=[np.number]).columns:
         imputed_data[col] = imputed_data[col].astype(numeric_data[col].dtype)
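The corrected labels matter for row-wise imputation (axis=1): the imputer runs on the transposed frame, so the rebuilt DataFrame of that transpose is indexed by the original columns and labelled by the original index. A standalone sketch of the idea (toy data; SimpleImputer is used here purely for illustration):

    import numpy as np
    import pandas as pd
    from sklearn.impute import SimpleImputer

    numeric_data = pd.DataFrame({"x": [1.0, np.nan, 5.0], "y": [3.0, 4.0, np.nan]})
    imputer = SimpleImputer(strategy="mean")

    # axis=1: impute across each row by working on the transpose; the transposed
    # result takes the original columns as its index, hence the swapped labels.
    row_filled_T = pd.DataFrame(
        imputer.fit_transform(numeric_data.T),
        index=numeric_data.columns,
        columns=numeric_data.index,
    )
    row_filled = row_filled_T.T   # back to the original orientation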
@@ -5826,8 +5845,13 @@ def df_encoder(
         from sklearn.preprocessing import LabelEncoder

         encoder = LabelEncoder()
-
-
+        # Apply LabelEncoder only to non-numeric columns
+        non_numeric_columns = [col for col in columns if not pd.api.types.is_numeric_dtype(data[col])]
+
+        if not non_numeric_columns:
+            return data
+        encoded_data = data[non_numeric_columns].apply(lambda col: encoder.fit_transform(col))
+        return pd.concat([data.drop(non_numeric_columns, axis=1), encoded_data], axis=1)

     # Target encoding (Mean of the target for each category)
     elif method == "target":
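The rewritten label branch encodes only non-numeric columns and passes numeric ones through untouched. The same idea in isolation (toy frame; inside the package this runs under df_encoder(..., method="label")):

    import pandas as pd
    from sklearn.preprocessing import LabelEncoder

    data = pd.DataFrame({"group": ["a", "b", "a"], "dose": [1.0, 2.0, 3.0]})
    columns = ["group", "dose"]
    encoder = LabelEncoder()

    non_numeric = [c for c in columns if not pd.api.types.is_numeric_dtype(data[c])]
    encoded = data[non_numeric].apply(lambda col: encoder.fit_transform(col))
    result = pd.concat([data.drop(non_numeric, axis=1), encoded], axis=1)
    print(result)   # "dose" unchanged, "group" mapped to 0/1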
@@ -6878,7 +6902,188 @@ def df_reducer(
     # example:
     # df_reducer(data=data_log, columns=markers, n_components=2)

+def df_format(data, threshold_unique=0.5, verbose=False):
+    """
+    Detect whether a table is in long, wide, or uncertain format.
+
+    Parameters:
+    - data (pd.DataFrame): DataFrame to check.
+    - threshold_unique (float): Proportion threshold for detecting categorical columns.
+
+    Returns:
+    - "long" if detected as long format,
+    - "wide" if detected as wide format
+    - "uncertain" if ambiguous.
+    """
+    from scipy.stats import entropy
+    from sklearn.cluster import AgglomerativeClustering
+    from sklearn.preprocessing import StandardScaler
+
+    long_score = 0
+    wide_score = 0
+
+    n_rows, n_cols = data.shape
+
+    # Step 1: Row-Column Ratio Heuristic
+    if n_rows > 3 * n_cols:
+        long_score += 2
+        if verbose:
+            print(
+                "Row-Column Ratio suggests long format (many rows relative to columns)."
+            )
+    elif n_cols > 3 * n_rows:
+        wide_score += 2
+        if verbose:
+            print(
+                "Row-Column Ratio suggests wide format (many columns relative to rows)."
+            )
+
+    # Step 2: Unique-to-duplicate ratio and entropy for categorical variables
+    unique_counts = data.apply(lambda x: x.nunique())
+    duplicate_ratio = 1 - unique_counts / n_rows
+    if (duplicate_ratio > 0.2).sum() > 0.5 * n_cols:
+        wide_score += 2
+        if verbose:
+            print("High duplicate values in columns suggest wide format.")
+    else:
+        long_score += 1
+        if verbose:
+            print(
+                "Lower duplicate ratio suggests long format (higher row variability)."
+            )

+    # Calculate entropy for categorical columns
+    categorical_cols = data.select_dtypes(include=["object", "category"]).columns
+    if len(categorical_cols) > 0:
+        for col in categorical_cols:
+            counts = data[col].value_counts(normalize=True)
+            col_entropy = entropy(counts)
+            if col_entropy < 1.5:
+                long_score += 1
+                if verbose:
+                    print(
+                        f"Column '{col}' entropy suggests categorical, supporting long format."
+                    )
+            else:
+                wide_score += 1
+                if verbose:
+                    print(f"Column '{col}' entropy is higher, supporting wide format.")
+
+    # Step 3: Column grouping analysis for patterns in suffixes/prefixes
+    col_names = data.columns.astype(str)
+    suffix_count = sum("_" in col or col[-1].isdigit() for col in col_names)
+    if suffix_count > 0.3 * n_cols:
+        wide_score += 2
+        if verbose:
+            print(
+                "Detected suffix/prefix patterns in column names, suggesting wide format."
+            )
+
+    # Step 4: Entity identifier detection for long format with categorical columns
+    if len(categorical_cols) > 0 and n_rows > n_cols:
+        entity_identifier_count = sum(
+            data.duplicated(subset=categorical_cols, keep=False)
+        )
+        if entity_identifier_count > 0.2 * n_rows:
+            long_score += 2
+            if verbose:
+                print(
+                    "Significant duplicate rows based on categorical columns, suggesting long format."
+                )
+
+    # Step 5: Clustering analysis on numerical columns for correlation in wide format
+    numeric_cols = data.select_dtypes(include="number").columns
+    if len(numeric_cols) > 1:
+        scaled_data = StandardScaler().fit_transform(data[numeric_cols].dropna())
+        clustering = AgglomerativeClustering(n_clusters=2).fit(scaled_data.T)
+        cluster_labels = pd.Series(clustering.labels_)
+        if cluster_labels.nunique() < len(numeric_cols) * 0.5:
+            wide_score += 2
+            if verbose:
+                print("Clustering on columns shows grouping, suggesting wide format.")
+
+    # Step 6: Inter-column correlation analysis
+    if len(numeric_cols) > 1:
+        corr_matrix = data[numeric_cols].corr().abs()
+        avg_corr = (
+            corr_matrix.where(~np.eye(len(corr_matrix), dtype=bool)).mean().mean()
+        )
+        if avg_corr > 0.6:
+            wide_score += 2
+            if verbose:
+                print("High inter-column correlation suggests wide format.")
+
+    # Step 7: Missing value pattern analysis
+    missing_patterns = data.isna().sum(axis=1)
+    if missing_patterns.std() < 2:
+        wide_score += 1
+        if verbose:
+            print(
+                "Low variation in missing patterns across rows, supporting wide format."
+            )
+    elif missing_patterns.mean() < 1:
+        long_score += 1
+        if verbose:
+            print("Lower missing pattern suggests long format (less structured).")
+
+    # Step 8: Multi-level clustering on rows to detect block structure for wide format
+    if len(numeric_cols) > 1 and n_rows > 5:
+        clustering_rows = AgglomerativeClustering(n_clusters=2).fit(scaled_data)
+        if pd.Series(clustering_rows.labels_).nunique() < 2:
+            wide_score += 2
+            if verbose:
+                print("Row clustering reveals homogeneity, suggesting wide format.")
+
+    # Step 9: Sequential name detection for time-series pattern in wide format
+    if any(col.isdigit() or col.startswith("T") for col in col_names):
+        wide_score += 1
+        if verbose:
+            print("Detected time-like sequential column names, supporting wide format.")
+
+    # Step 10: Entropy of numeric columns
+    numeric_entropy = data[numeric_cols].apply(
+        lambda x: entropy(pd.cut(x, bins=10).value_counts(normalize=True))
+    )
+    if numeric_entropy.mean() < 2:
+        wide_score += 2
+        if verbose:
+            print(
+                "Low entropy in numeric columns indicates stability across columns, supporting wide format."
+            )
+
+    # Step 11: Tie-breaking strategy if scores are equal
+    if wide_score == long_score:
+        if n_cols > n_rows:
+            wide_score += 1
+            if verbose:
+                print(
+                    "Tie-breaking based on column-major structure, favoring wide format."
+                )
+        elif n_rows > n_cols:
+            long_score += 1
+            if verbose:
+                print(
+                    "Tie-breaking based on row-major structure, favoring long format."
+                )
+        else:
+            if verbose:
+                print("Tie-breaking inconclusive; returning 'uncertain'.")
+            return "uncertain"
+
+    # Final decision
+    if wide_score > long_score:
+        if verbose:
+            print("Final decision: Wide format.")
+        return "wide"
+    elif long_score > wide_score:
+        if verbose:
+            print("Final decision: Long format.")
+        return "long"
+    else:
+        if verbose:
+            print("Final decision: Uncertain format.")
+        return "uncertain"
+
 def plot_cluster(
     data: pd.DataFrame,
     labels: np.ndarray,
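A short usage sketch of the new df_format heuristic (toy frame; the scoring is heuristic, so the printed verdict may vary with the data):

    import pandas as pd
    from py2ls import ips

    # a tall frame with a repeated identifier column should lean towards "long"
    df_long = pd.DataFrame({
        "subject": ["s1", "s2", "s3"] * 10,
        "value": range(30),
    })
    print(ips.df_format(df_long, verbose=True))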
@@ -7126,7 +7331,514 @@ def evaluate_cluster(
         metrics["V-Measure"] = np.nan

     return metrics
+def df_qc(
+    data: pd.DataFrame,
+    columns=None,
+    verbose=False,
+    plot_=True,
+    max_cols=20,  # only for plots
+    output=False,
+):
+    """
+    Usage example:
+    df = pd.DataFrame(...)  # your DataFrame; res_qc = df_qc(df)
+    """
+    from statsmodels.stats.outliers_influence import variance_inflation_factor
+    from scipy.stats import skew, kurtosis, entropy
+    import skimpy
+
+    #! display(data.select_dtypes(include=[np.number]).describe())
+    #!skim
+    if columns is not None:
+        if isinstance(columns, (list,pd.core.indexes.base.Index)):
+            data=data[columns]
+    try:
+        skimpy.skim(data)
+    except:
+        numerical_data = data.select_dtypes(include=[np.number])
+        skimpy.skim(numerical_data)
+    # Fill completely NaN columns with a default value (e.g., 0)
+    data = data.copy()
+    data.loc[:, data.isna().all()] = 0
+    res_qc = {}
+
+    # Missing values
+    res_qc["missing_values"] = data.isnull().sum()
+    res_qc["missing_percentage"] = (res_qc["missing_values"] / len(data)) * 100
+    res_qc["rows_with_missing"] = data.isnull().any(axis=1).sum()
+
+    # Data types and unique values
+    res_qc["data_types"] = data.dtypes
+    res_qc["unique_values"] = data.nunique()
+    res_qc["constant_columns"] = [
+        col for col in data.columns if data[col].nunique() <= 1
+    ]
+
+    # Duplicate rows and columns
+    res_qc["duplicate_rows"] = data.duplicated().sum()
+    res_qc["duplicate_columns"] = data.columns[data.columns.duplicated()].tolist()
+
+    # Empty columns
+    res_qc["empty_columns"] = [col for col in data.columns if data[col].isnull().all()]
+
+    # outliers
+    data_outliers = df_outlier(data)
+    outlier_num = data_outliers.isna().sum() - data.isnull().sum()
+    res_qc["outlier_num"] = outlier_num[outlier_num > 0]
+    outlier_percentage=(outlier_num / len(data_outliers)) * 100
+    res_qc["outlier_percentage"] = outlier_percentage[outlier_percentage>0]
+    # Correlation and multicollinearity (VIF)
+    if any(data.dtypes.apply(pd.api.types.is_numeric_dtype)):
+        numeric_df = data.select_dtypes(include=[np.number]).dropna()
+        corr_matrix = numeric_df.corr()
+        high_corr_pairs = [
+            (col1, col2)
+            for col1 in corr_matrix.columns
+            for col2 in corr_matrix.columns
+            if col1 != col2 and abs(corr_matrix[col1][col2]) > 0.9
+        ]
+        res_qc["high_correlations"] = high_corr_pairs
+
+        # VIF for multicollinearity check
+        numeric_df = data.select_dtypes(include=[np.number]).dropna()
+        vif_data = pd.DataFrame()
+        res_qc["vif"]=vif_data
+        if numeric_df.shape[1] > 1:
+            vif_data["feature"] = numeric_df.columns
+            vif_data["VIF"] = [
+                variance_inflation_factor(numeric_df.values, i)
+                for i in range(numeric_df.shape[1])
+            ]
+            res_qc["vif"] = vif_data[
+                vif_data["VIF"] > 5
+            ]  # Typically VIF > 5 indicates multicollinearity
+    # Skewness and Kurtosis
+    skewness = data.skew(numeric_only=True)
+    kurtosis_vals = data.kurt(numeric_only=True)
+    res_qc["skewness"] = skewness[abs(skewness) > 1]
+    res_qc["kurtosis"] = kurtosis_vals[abs(kurtosis_vals) > 3]
+
+    # Entropy for categorical columns (higher entropy suggests more disorder)
+    categorical_cols = data.select_dtypes(include=["object", "category"]).columns
+    res_qc["entropy_categoricals"] = {
+        col: entropy(data[col].value_counts(normalize=True), base=2)
+        for col in categorical_cols
+    }
+    # number of unique
+    res_qc["unique_counts"] = data.nunique()
+    # dtypes counts
+    res_qc['dtype_counts']=data.dtypes.value_counts()
+
+    # Distribution Analysis (mean, median, mode, std dev, IQR for numeric columns)
+    distribution_stats = data.select_dtypes(include=[np.number]).describe().T
+    iqr = data.select_dtypes(include=[np.number]).apply(
+        lambda x: x.quantile(0.75) - x.quantile(0.25)
+    )
+    distribution_stats["IQR"] = iqr
+    res_qc["distribution_analysis"] = distribution_stats
+
+    # Variance Check: Identify low-variance columns
+    variance_threshold = 0.01
+    low_variance_cols = [
+        col
+        for col in data.select_dtypes(include=[np.number]).columns
+        if data[col].var() < variance_threshold
+    ]
+    res_qc["low_variance_features"] = low_variance_cols
+
+    # Categorical columns and cardinality
+    categorical_cols = data.select_dtypes(include=["object", "category"]).columns
+    high_cardinality = {
+        col: data[col].nunique() for col in categorical_cols if data[col].nunique() > 50
+    }
+    res_qc["high_cardinality_categoricals"] = high_cardinality
+
+    # Feature-type inconsistency (mixed types in columns)
+    inconsistent_types = {}
+    for col in data.columns:
+        unique_types = set(type(val) for val in data[col].dropna())
+        if len(unique_types) > 1:
+            inconsistent_types[col] = unique_types
+    res_qc["inconsistent_types"] = inconsistent_types
+
+
+    # Text length analysis for text fields
+    text_lengths = {}
+    for col in categorical_cols:
+        text_lengths[col] = {
+            "avg_length": data[col].dropna().apply(len).mean(),
+            "length_variance": data[col].dropna().apply(len).var(),
+        }
+    res_qc["text_length_analysis"] = text_lengths
+
+    # Summary statistics
+    res_qc["summary_statistics"] = data.describe().T
+
+    # Automated warnings
+    warnings = []
+    if res_qc["duplicate_rows"] > 0:
+        warnings.append("Warning: Duplicate rows detected.")
+    if len(res_qc["empty_columns"]) > 0:
+        warnings.append("Warning: Columns with only NaN values detected.")
+    if len(res_qc["constant_columns"]) > 0:
+        warnings.append("Warning: Columns with a single constant value detected.")
+    if len(high_corr_pairs) > 0:
+        warnings.append("Warning: Highly correlated columns detected.")
+    if len(res_qc["vif"]) > 0:
+        warnings.append("Warning: Multicollinearity detected in features.")
+    if len(high_cardinality) > 0:
+        warnings.append("Warning: High cardinality in categorical columns.")
+    if len(inconsistent_types) > 0:
+        warnings.append("Warning: Columns with mixed data types detected.")
+    res_qc["warnings"] = warnings
+
+    # Report generation
+    if verbose:
+        print("=== QC Report Summary ===")
+        print("\nMissing Values (Total and %):")
+        print(res_qc["missing_values"][res_qc["missing_values"] > 0])
+        print(res_qc["missing_percentage"][res_qc["missing_percentage"] > 0])
+
+        print("\nRows with Missing Values:", res_qc["rows_with_missing"])
+
+        print("\nData Types:")
+        print(res_qc["data_types"])
+
+        print("\nUnique Values per Column:")
+        print(res_qc["unique_values"])
+
+        print("\nConstant Columns:", res_qc["constant_columns"])
+
+        print("\nDuplicate Rows:", res_qc["duplicate_rows"])
+        print("Duplicate Columns:", res_qc["duplicate_columns"])
+
+        if res_qc["empty_columns"]:
+            print("\nEmpty Columns:", res_qc["empty_columns"])
+
+        print("\nOutlier Report:")
+        print(res_qc["outlier_num"])
+        print("\nPercentage of Values Replaced per Column:")
+        print(res_qc["outlier_percentage"])
+
+        print("\nHigh Correlations (>|0.9|):")
+        for col1, col2 in res_qc["high_correlations"]:
+            print(f" {col1} and {col2}")
+
+        if "vif" in res_qc:
+            print("\nFeatures with High VIF (>|5|):")
+            print(res_qc["vif"])
+
+        print("\nHigh Cardinality Categorical Columns (>|50 unique|):")
+        print(res_qc["high_cardinality_categoricals"])
+
+        print("\nInconsistent Data Types:")
+        print(res_qc["inconsistent_types"])
+
+        print("\nRange Checks for Numeric Columns:")
+        print(res_qc["range_checks"])
+
+        print("\nText Length Analysis:")
+        for col, stats in res_qc["text_length_analysis"].items():
+            print(
+                f"{col}: Avg Length={stats['avg_length']}, Length Variance={stats['length_variance']}"
+            )
+
+        print("\nSummary Statistics:")
+        print(res_qc["summary_statistics"])

+        if res_qc["warnings"]:
+            print("\nWarnings:")
+            for warning in res_qc["warnings"]:
+                print(" -", warning)
+    if plot_:
+        df_qc_plots(data=data, res_qc=res_qc, max_cols=20)
+    if output:
+        return res_qc
+    return None
+
+
+def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20):
+    import matplotlib.pyplot as plt
+    import seaborn as sns
+    from .plot import subplot, figsets, get_color
+
+    if columns is not None:
+        if isinstance(columns, (list,pd.core.indexes.base.Index)):
+            data=data[columns]
+    len_total = len(res_qc)
+    n_row, n_col = int((len_total + 10) / 3), 3
+    nexttile = subplot(n_row, n_col, figsize=[5 * n_col, 5 * n_row],verbose=False)
+
+    missing_data = res_qc["missing_values"][res_qc["missing_values"] > 0].sort_values(
+        ascending=False
+    )
+    if len(missing_data) > max_cols:
+        missing_data = missing_data[:max_cols]
+    ax=sns.barplot(
+        x=missing_data.index,
+        y=missing_data.values,
+        hue=missing_data.index,
+        palette=get_color(len(missing_data), cmap="Blues")[::-1],
+        ax=nexttile(),
+    )
+    figsets(xangle=45, title="Missing (#)", ylabel="#",ax=ax)
+
+    ax2 = ax.twinx()
+    # Plot missing value percentages
+    missing_percentage = res_qc["missing_percentage"][
+        res_qc["missing_percentage"] > 0
+    ].sort_values(ascending=False)
+    sns.barplot(
+        x=missing_percentage.index,
+        y=missing_percentage.values,
+        hue=missing_percentage.index,
+        palette=get_color(len(missing_percentage), cmap="Blues")[::-1],
+        ax=ax2,#nexttile(),
+    )
+    figsets(xangle=45, ylabel="%",ax=ax2)
+    ax2.tick_params(axis="y", color='r',labelcolor='r')
+    ax2.yaxis.label.set_color('r')
+
+    outlier_num = res_qc["outlier_num"].sort_values(ascending=False)
+    if len(outlier_num) > max_cols:
+        outlier_num = outlier_num[:max_cols]
+    ax_outlier_num=sns.barplot(
+        x=outlier_num.index,
+        y=outlier_num.values,
+        hue=outlier_num.index,
+        palette=get_color(len(outlier_num), cmap="coolwarm")[::-1],
+        ax=nexttile(),
+    )
+    figsets(xangle=45, title="Outliers (#)", ylabel="#",xlabel=None)
+    ax_outlier_percentage = ax_outlier_num.twinx()
+    outlier_percentage = res_qc["outlier_percentage"].sort_values(ascending=False)
+    if len(outlier_percentage) > max_cols:
+        outlier_percentage = outlier_percentage[:max_cols]
+    ax_outlier_percentage=sns.barplot(
+        x=outlier_percentage.index,
+        y=outlier_percentage.values,
+        hue=outlier_percentage.index,
+        palette=get_color(len(outlier_percentage), cmap="coolwarm")[::-1],
+        ax=ax2 #nexttile(),
+    )
+    figsets(
+        xangle=45,
+        ylabel="%",
+        xlabel=None,
+        ylim=[0, outlier_percentage.max() + 2],
+        ax=ax_outlier_percentage
+    )
+    ax2.tick_params(axis="y", color='r',labelcolor='r')
+    ax2.yaxis.label.set_color('r')
+
+    # Skewness and Kurtosis Plots
+    skewness = res_qc["skewness"].sort_values(ascending=False)
+    kurtosis = res_qc["kurtosis"].sort_values(ascending=False)
+    if not skewness.empty:
+        ax_skewness=sns.barplot(
+            x=skewness.index,
+            y=skewness.values,
+            hue=skewness.index,
+            palette=get_color(len(skewness), cmap="coolwarm")[::-1],
+            ax=nexttile(),
+        )
+        figsets(
+            xangle=45,
+            title="Highly Skewed Numeric Columns (Skewness > 1)",
+            ylabel="Skewness",xlabel=None,ax=ax_skewness
+        )
+    if not kurtosis.empty:
+        ax_kurtosis=sns.barplot(
+            x=kurtosis.index,
+            y=kurtosis.values,
+            hue=kurtosis.index,
+            palette=get_color(len(kurtosis), cmap="coolwarm")[::-1],
+            ax=nexttile(),
+        )
+        figsets(
+            xangle=45,
+            title="Highly Kurtotic Numeric Columns (Kurtosis > 3)",
+            ylabel="Kurtosis",xlabel=None,ax=ax_kurtosis
+        )
+
+    # Entropy for Categorical Variables
+    entropy_data = pd.Series(res_qc["entropy_categoricals"]).sort_values(
+        ascending=False
+    )
+    ax_entropy_data=sns.barplot(
+        x=entropy_data.index, y=entropy_data.values,hue=entropy_data.index, palette="viridis", ax=nexttile()
+    )
+    figsets(
+        xangle=45,
+        xlabel="Categorical Columns",
+        title="Entropy of Categorical Variables",
+        ylabel="Entropy (bits)",
+        ax=ax_entropy_data
+    )
+    # Distribution Analysis: Boxplot for IQR
+    ax_iqr=sns.boxplot(
+        data=data[res_qc["distribution_analysis"].index],
+        orient="v",
+        palette="Set3",
+        ax=nexttile(),
+    )
+    figsets(
+        xangle=45,
+        title="Range for Numeric Columns",
+        ylabel="#",
+        ax=ax_iqr
+    )
+    # unique counts
+    unique_counts=res_qc["unique_counts"].sort_values(ascending=False)
+    ax_unique_counts_=sns.barplot(
+        x=unique_counts.index,
+        y=unique_counts.values,
+        hue=unique_counts.index,
+        palette=get_color(len(unique_counts)+10, cmap="Blues")[::-1],
+        ax=nexttile())
+    figsets(
+        xangle=45,
+        title="Unique Counts",
+        xlabel=None,
+        ylabel="#",
+        ax=ax_unique_counts_
+    )
+    # Binary Checking
+    ax_unique_counts=sns.barplot(x=unique_counts[unique_counts<10].index,
+        y=unique_counts[unique_counts<10].values,
+        hue=unique_counts[unique_counts<10].index,
+        palette=get_color(len(unique_counts[unique_counts<10])+10, cmap="Blues")[::-1],
+        ax=nexttile())
+    plt.axhline(y=2, color="r", linestyle="--", lw=2)
+    figsets(
+        xangle=45,
+        xlabel=None,
+        title="Binary Checking",
+        ylabel="#",
+        ax=ax_unique_counts
+    )
+
+    # dtypes counts
+    dtype_counts = res_qc['dtype_counts']
+    txt = []
+    for tp in dtype_counts.index:
+        txt.append(list(data.select_dtypes(include=tp).columns))
+
+    ax_dtype_counts = sns.barplot(
+        x=dtype_counts.index,
+        y=dtype_counts.values,
+        color="#F3C8B2",
+        ax=nexttile(),
+    )
+    max_columns_per_row = 1  # Maximum number of columns per row
+    for i, tp in enumerate(dtype_counts.index):
+        if i<=20:
+            column_names = txt[i]
+            # Split the column names into multiple lines if too long
+            column_name_str = ", ".join(column_names)
+            if len(column_name_str) > 40:  # If column names are too long, split them
+                column_name_str = "\n".join(
+                    [
+                        ", ".join(column_names[j : j + max_columns_per_row])
+                        for j in range(0, len(column_names), max_columns_per_row)
+                    ]
+                )
+            # Place text annotation with line breaks and rotate the text if needed
+            ax_dtype_counts.text(
+                i,
+                dtype_counts.values[i],
+                f"{column_name_str}",
+                ha="center",
+                va="top",
+                c="k",
+                fontsize=8,
+                rotation=0,
+            )
+    figsets(
+        xlabel=None,
+        title="Dtypes",
+        ylabel="#",
+        ax=ax_dtype_counts
+    )
+
+    # High cardinality: Show top categorical columns by unique value count
+    high_cardinality = res_qc["high_cardinality_categoricals"]
+    if high_cardinality and len(high_cardinality) > max_cols:
+        high_cardinality = dict(
+            sorted(high_cardinality.items(), key=lambda x: x[1], reverse=True)[
+                :max_cols
+            ]
+        )
+
+    if high_cardinality:
+        ax_high_cardinality=sns.barplot(
+            x=list(high_cardinality.keys()),
+            y=list(high_cardinality.values()),
+            hue=list(high_cardinality.keys()),
+            palette="Oranges", ax=nexttile()
+        )
+        figsets(
+            xangle=45,
+            title="High Cardinality Categorical Columns",
+            ylabel="Unique Value Count",
+            ax=ax_high_cardinality
+        )
+    if res_qc["low_variance_features"]:
+        low_variance_data = data[res_qc["low_variance_features"]].copy()
+        for col in low_variance_data.columns:
+            sns.histplot(
+                low_variance_data[col], bins=20, kde=True, color="coral", ax=nexttile()
+            )
+            plt.title(f"Low Variance Feature: {col}")
+
+    # VIF plot for multicollinearity detection
+    if "vif" in res_qc and not res_qc["vif"].empty:
+        vif_data = res_qc["vif"].sort_values(by="VIF", ascending=False)
+        if len(vif_data) > max_cols:
+            vif_data = vif_data[:max_cols]
+        ax_vif=sns.barplot(data=vif_data,
+            x="VIF",
+            y="feature",
+            hue="VIF",
+            palette=get_color(len(vif_data)+10, cmap="Blues")[::-1],
+            ax=nexttile())
+        figsets(
+            xangle=45,
+            title="Variance Inflation Factor(VIF)",
+            xlabel="Variance Inflation Factor(VIF)",
+            ylabel="Features",
+            legend=None,
+            ax=ax_vif
+        )
+
+    # Correlation heatmap for numeric columns with high correlation pairs
+    if any(data.dtypes.apply(pd.api.types.is_numeric_dtype)):
+        corr = data.select_dtypes(include=[np.number]).dropna().corr()
+        if corr.shape[1]<=33:
+            mask = np.triu(np.ones_like(corr, dtype=bool))
+            # Dynamically scale fontsize based on the number of columns
+            num_columns = corr.shape[1]
+            fontsize = max(6, min(12, 12 - (num_columns - 10) * 0.2))  # Scale between 8 and 12
+
+            ax_heatmap=sns.heatmap(
+                corr,
+                mask=mask,
+                annot=True,
+                cmap="coolwarm",
+                center=0,
+                fmt=".2f",
+                linewidths=0.5,
+                vmin=-1, vmax=1,
+                ax=nexttile(2, 2),
+                cbar_kws=dict(shrink=0.2,ticks=np.arange(-1, 2, 1)),
+                annot_kws={"size": fontsize}
+            )
+
+            figsets(
+                xangle=45,
+                title="Correlation Heatmap",
+                ax=ax_heatmap
+            )

 def use_pd(
     func_name="excel",