py2ls 0.2.4.15__py3-none-any.whl → 0.2.4.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- py2ls/.git/index +0 -0
- py2ls/ips.py +724 -12
- py2ls/ml2ls copy.py +2906 -0
- py2ls/ml2ls.py +411 -16
- py2ls/plot.py +409 -24
- {py2ls-0.2.4.15.dist-info → py2ls-0.2.4.17.dist-info}/METADATA +2 -1
- {py2ls-0.2.4.15.dist-info → py2ls-0.2.4.17.dist-info}/RECORD +8 -7
- {py2ls-0.2.4.15.dist-info → py2ls-0.2.4.17.dist-info}/WHEEL +0 -0
py2ls/ips.py
CHANGED
@@ -2171,6 +2171,8 @@ def fload(fpath, kind=None, **kwargs):
                 continue
             else:
                 pass
+        if is_df_abnormal(df,verbose=verbose):
+            df=pd.read_csv(fpath,**kwargs)
         display(df.head(2))
         print(f"shape: {df.shape}")
         return df
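Note: the new guard re-reads the file with the caller's own kwargs when the first parse looks malformed. A minimal sketch of the same retry pattern, with a hypothetical looks-wrong check standing in for the package's is_df_abnormal():

    import pandas as pd

    def load_csv_with_retry(fpath, **kwargs):
        df = pd.read_csv(fpath, sep=None, engine="python")  # first pass: permissive delimiter guess
        # Hypothetical sanity check: a single "Unnamed" column usually means
        # the delimiter was guessed wrong.
        if df.shape[1] == 1 and str(df.columns[0]).startswith("Unnamed"):
            df = pd.read_csv(fpath, **kwargs)  # retry with the caller's options
        return df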
@@ -3163,14 +3165,19 @@ def listdir(
     if kind is None:
         ls = os.listdir(rootdir)
         ls = [f for f in ls if not f.startswith(".") and not f.startswith("~")]
-
+        if verbose:
+            if len(ls)>20:
+                print(ls[:20])
+            else:
+                print(ls)
         df_all = pd.DataFrame(
             {
                 "fname": ls,
                 "fpath": [os.path.join(rootdir, i) for i in ls],
             }
         )
-
+        if verbose:
+            display(df_all.head())
         return df_all
     if isinstance(kind, list):
         f_ = []
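Note: the verbose branch caps console output at 20 entries before the DataFrame is built, e.g.:

    ls = [f"file_{i}.txt" for i in range(100)]
    print(ls[:20] if len(ls) > 20 else ls)  # same truncation as the new branch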
@@ -3206,6 +3213,7 @@ def listdir(
         "size": [],
         "fname": [],
         "fpath": [],
+        "basename":[],
     }
     for item in ls:
         item_path = os.path.join(rootdir, item)
@@ -3228,6 +3236,7 @@ def listdir(
                 f["length"].append(len(filename))
                 f["path"].append(os.path.join(os.path.dirname(item_path), item))
                 fpath = os.path.join(os.path.dirname(item_path), item)
+                basename=os.path.basename(item_path)
                 f["size"].append(round(os.path.getsize(fpath) / 1024 / 1024, 3))
                 f["created_time"].append(
                     pd.to_datetime(os.path.getctime(item_path), unit="s")
@@ -3240,6 +3249,7 @@ def listdir(
                 )
                 f["fname"].append(filename)  # will be removed
                 f["fpath"].append(fpath)  # will be removed
+                f['basename'].append(basename)
                 i += 1
 
         f["num"] = i
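Note: the three listdir hunks above thread a new "basename" column through the result; os.path.basename() keeps only the final path component:

    import os
    print(os.path.basename("/home/user/data/report.csv"))  # report.csv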
@@ -3462,7 +3472,6 @@ def figsave(*args, dpi=300):
             img.save(fname, format=ftype.upper(), dpi=(dpi, dpi))
         elif isinstance(img, np.ndarray):
             import cv2
-
             # Check the shape of the image to determine color mode
             if img.ndim == 2:
                 # Grayscale image
@@ -5055,16 +5064,22 @@ def _df_outlier(
     from scipy.stats import zscore
     from sklearn.ensemble import IsolationForest
     from sklearn.preprocessing import StandardScaler
-
+
+    # Fill completely NaN columns with a default value (e.g., 0)
+    data = data.copy()
+    data.loc[:, data.isna().all()] = 0
+    if columns is not None:
+        if isinstance(columns, (list,pd.core.indexes.base.Index)):
+            data=data[columns]
     col_names_org = data.columns.tolist()
     index_names_org = data.index.tolist()
     # Separate numeric and non-numeric columns
     numeric_data = data.select_dtypes(include=[np.number])
     non_numeric_data = data.select_dtypes(exclude=[np.number])
 
-    if columns is not None:
-
-
+    # if columns is not None:
+    #     numeric_data = numeric_data[columns]
+    if numeric_data.empty:
         raise ValueError("Input data must contain numeric columns.")
 
     outliers_df = pd.DataFrame(index=numeric_data.index)
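Note: zeroing all-NaN columns up front keeps the downstream scaler and IsolationForest from seeing columns with no observed values. The boolean-mask assignment touches only those columns:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [np.nan] * 3})
    df = df.copy()
    df.loc[:, df.isna().all()] = 0  # only 'b' is all-NaN, so only 'b' is zeroed
    print(df["b"].tolist())  # [0.0, 0.0, 0.0]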
@@ -5626,6 +5641,10 @@ def df_fillna(
     for col in data.columns:
         data[col] = data[col].apply(lambda x: np.nan if x is None else x)
 
+    # Fill completely NaN columns with a default value (e.g., 0)
+    data = data.copy()
+    data.loc[:, data.isna().all()] = 0
+
     col_names_org = data.columns.tolist()
     index_names_org = data.index.tolist()
     # Separate numeric and non-numeric columns
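Note: the same zero-fill appears here because scikit-learn imputers discard features that are entirely missing at fit time (recent versions expose keep_empty_features to override this), so the imputed array can come back narrower than the input:

    import numpy as np
    import pandas as pd
    from sklearn.impute import SimpleImputer

    X = pd.DataFrame({"a": [1.0, np.nan, 3.0], "b": [np.nan] * 3})
    out = SimpleImputer(strategy="mean").fit_transform(X)
    print(out.shape)  # (3, 1): the all-NaN column 'b' was dropped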
@@ -5682,11 +5701,11 @@ def df_fillna(
         imputed_data = imputer.fit_transform(numeric_data.T)
     else:
         raise ValueError("Invalid axis. Use 0 for columns or 1 for rows.")
-
+
     imputed_data = pd.DataFrame(
         imputed_data if axis == 0 else imputed_data.T,
-        index=numeric_data.index if axis == 0 else
-        columns=numeric_data.columns if axis == 0 else
+        index=numeric_data.index if axis == 0 else numeric_data.columns,
+        columns=numeric_data.columns if axis == 0 else numeric_data.index,
    )
     for col in imputed_data.select_dtypes(include=[np.number]).columns:
         imputed_data[col] = imputed_data[col].astype(numeric_data[col].dtype)
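Note: the replaced lines complete two conditional expressions that end at a bare "else" as rendered in this diff view. The fixed lines swap the label sets when the imputer ran on the transposed frame; an illustrative, self-contained version of that relabeling pattern (names here are not the package's):

    import numpy as np
    import pandas as pd

    values = np.array([[1.0, 2.0], [3.0, 4.0]])
    row_labels, col_labels = ["r1", "r2"], ["c1", "c2"]
    axis = 1  # pretend the imputation ran row-wise on the transposed array
    frame = pd.DataFrame(
        values if axis == 0 else values.T,
        index=row_labels if axis == 0 else col_labels,
        columns=col_labels if axis == 0 else row_labels,
    )
    print(frame.index.tolist())  # ['c1', 'c2']: labels follow the orientation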
@@ -5826,8 +5845,13 @@ def df_encoder(
         from sklearn.preprocessing import LabelEncoder
 
         encoder = LabelEncoder()
-
-
+        # Apply LabelEncoder only to non-numeric columns
+        non_numeric_columns = [col for col in columns if not pd.api.types.is_numeric_dtype(data[col])]
+
+        if not non_numeric_columns:
+            return data
+        encoded_data = data[non_numeric_columns].apply(lambda col: encoder.fit_transform(col))
+        return pd.concat([data.drop(non_numeric_columns, axis=1), encoded_data], axis=1)
 
     # Target encoding (Mean of the target for each category)
     elif method == "target":
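Note: the label branch now encodes only non-numeric columns and splices the codes back next to the untouched numeric ones. A standalone run of the same steps:

    import pandas as pd
    from sklearn.preprocessing import LabelEncoder

    data = pd.DataFrame({"city": ["Berlin", "Paris", "Berlin"], "n": [1, 2, 3]})
    columns = ["city", "n"]
    non_numeric = [c for c in columns if not pd.api.types.is_numeric_dtype(data[c])]
    enc = LabelEncoder()
    encoded = data[non_numeric].apply(lambda col: enc.fit_transform(col))
    print(pd.concat([data.drop(non_numeric, axis=1), encoded], axis=1))
    #    n  city
    # 0  1     0
    # 1  2     1
    # 2  3     0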
@@ -6878,7 +6902,188 @@ def df_reducer(
     # example:
     # df_reducer(data=data_log, columns=markers, n_components=2)
 
+def df_format(data, threshold_unique=0.5, verbose=False):
+    """
+    Detect whether a table is in "long", "wide", or uncertain format.
+
+    Parameters:
+    - data (pd.DataFrame): DataFrame to check.
+    - threshold_unique (float): Proportion threshold for detecting categorical columns.
+
+    Returns:
+    - "long" if detected as long format,
+    - "wide" if detected as wide format,
+    - "uncertain" if ambiguous.
+    """
+    from scipy.stats import entropy
+    from sklearn.cluster import AgglomerativeClustering
+    from sklearn.preprocessing import StandardScaler
+
+    long_score = 0
+    wide_score = 0
+
+    n_rows, n_cols = data.shape
+
+    # Step 1: Row-Column Ratio Heuristic
+    if n_rows > 3 * n_cols:
+        long_score += 2
+        if verbose:
+            print(
+                "Row-Column Ratio suggests long format (many rows relative to columns)."
+            )
+    elif n_cols > 3 * n_rows:
+        wide_score += 2
+        if verbose:
+            print(
+                "Row-Column Ratio suggests wide format (many columns relative to rows)."
+            )
+
+    # Step 2: Unique-to-duplicate ratio and entropy for categorical variables
+    unique_counts = data.apply(lambda x: x.nunique())
+    duplicate_ratio = 1 - unique_counts / n_rows
+    if (duplicate_ratio > 0.2).sum() > 0.5 * n_cols:
+        wide_score += 2
+        if verbose:
+            print("High duplicate values in columns suggest wide format.")
+    else:
+        long_score += 1
+        if verbose:
+            print(
+                "Lower duplicate ratio suggests long format (higher row variability)."
+            )
 
+    # Calculate entropy for categorical columns
+    categorical_cols = data.select_dtypes(include=["object", "category"]).columns
+    if len(categorical_cols) > 0:
+        for col in categorical_cols:
+            counts = data[col].value_counts(normalize=True)
+            col_entropy = entropy(counts)
+            if col_entropy < 1.5:
+                long_score += 1
+                if verbose:
+                    print(
+                        f"Column '{col}' entropy suggests categorical, supporting long format."
+                    )
+            else:
+                wide_score += 1
+                if verbose:
+                    print(f"Column '{col}' entropy is higher, supporting wide format.")
+
+    # Step 3: Column grouping analysis for patterns in suffixes/prefixes
+    col_names = data.columns.astype(str)
+    suffix_count = sum("_" in col or col[-1].isdigit() for col in col_names)
+    if suffix_count > 0.3 * n_cols:
+        wide_score += 2
+        if verbose:
+            print(
+                "Detected suffix/prefix patterns in column names, suggesting wide format."
+            )
+
+    # Step 4: Entity identifier detection for long format with categorical columns
+    if len(categorical_cols) > 0 and n_rows > n_cols:
+        entity_identifier_count = sum(
+            data.duplicated(subset=categorical_cols, keep=False)
+        )
+        if entity_identifier_count > 0.2 * n_rows:
+            long_score += 2
+            if verbose:
+                print(
+                    "Significant duplicate rows based on categorical columns, suggesting long format."
+                )
+
+    # Step 5: Clustering analysis on numerical columns for correlation in wide format
+    numeric_cols = data.select_dtypes(include="number").columns
+    if len(numeric_cols) > 1:
+        scaled_data = StandardScaler().fit_transform(data[numeric_cols].dropna())
+        clustering = AgglomerativeClustering(n_clusters=2).fit(scaled_data.T)
+        cluster_labels = pd.Series(clustering.labels_)
+        if cluster_labels.nunique() < len(numeric_cols) * 0.5:
+            wide_score += 2
+            if verbose:
+                print("Clustering on columns shows grouping, suggesting wide format.")
+
+    # Step 6: Inter-column correlation analysis
+    if len(numeric_cols) > 1:
+        corr_matrix = data[numeric_cols].corr().abs()
+        avg_corr = (
+            corr_matrix.where(~np.eye(len(corr_matrix), dtype=bool)).mean().mean()
+        )
+        if avg_corr > 0.6:
+            wide_score += 2
+            if verbose:
+                print("High inter-column correlation suggests wide format.")
+
+    # Step 7: Missing value pattern analysis
+    missing_patterns = data.isna().sum(axis=1)
+    if missing_patterns.std() < 2:
+        wide_score += 1
+        if verbose:
+            print(
+                "Low variation in missing patterns across rows, supporting wide format."
+            )
+    elif missing_patterns.mean() < 1:
+        long_score += 1
+        if verbose:
+            print("Lower missing pattern suggests long format (less structured).")
+
+    # Step 8: Multi-level clustering on rows to detect block structure for wide format
+    if len(numeric_cols) > 1 and n_rows > 5:
+        clustering_rows = AgglomerativeClustering(n_clusters=2).fit(scaled_data)
+        if pd.Series(clustering_rows.labels_).nunique() < 2:
+            wide_score += 2
+            if verbose:
+                print("Row clustering reveals homogeneity, suggesting wide format.")
+
+    # Step 9: Sequential name detection for time-series pattern in wide format
+    if any(col.isdigit() or col.startswith("T") for col in col_names):
+        wide_score += 1
+        if verbose:
+            print("Detected time-like sequential column names, supporting wide format.")
+
+    # Step 10: Entropy of numeric columns
+    numeric_entropy = data[numeric_cols].apply(
+        lambda x: entropy(pd.cut(x, bins=10).value_counts(normalize=True))
+    )
+    if numeric_entropy.mean() < 2:
+        wide_score += 2
+        if verbose:
+            print(
+                "Low entropy in numeric columns indicates stability across columns, supporting wide format."
+            )
+
+    # Step 11: Tie-breaking strategy if scores are equal
+    if wide_score == long_score:
+        if n_cols > n_rows:
+            wide_score += 1
+            if verbose:
+                print(
+                    "Tie-breaking based on column-major structure, favoring wide format."
+                )
+        elif n_rows > n_cols:
+            long_score += 1
+            if verbose:
+                print(
+                    "Tie-breaking based on row-major structure, favoring long format."
+                )
+        else:
+            if verbose:
+                print("Tie-breaking inconclusive; returning 'uncertain'.")
+            return "uncertain"
+
+    # Final decision
+    if wide_score > long_score:
+        if verbose:
+            print("Final decision: Wide format.")
+        return "wide"
+    elif long_score > wide_score:
+        if verbose:
+            print("Final decision: Long format.")
+        return "long"
+    else:
+        if verbose:
+            print("Final decision: Uncertain format.")
+        return "uncertain"
+
 def plot_cluster(
     data: pd.DataFrame,
     labels: np.ndarray,
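Note: df_format() sums the long/wide heuristic scores above and returns the higher-scoring label. A quick smoke test (import path assumed from this release's py2ls/ips.py):

    import pandas as pd
    from py2ls.ips import df_format  # assumed import path

    long_df = pd.DataFrame({
        "subject": ["s1"] * 50 + ["s2"] * 50,
        "measure": ["a", "b"] * 50,
        "value": range(100),
    })
    print(df_format(long_df, verbose=True))  # expected to lean toward "long"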
@@ -7126,7 +7331,514 @@ def evaluate_cluster(
         metrics["V-Measure"] = np.nan
 
     return metrics
+def df_qc(
+    data: pd.DataFrame,
+    columns=None,
+    verbose=False,
+    plot_=True,
+    max_cols=20,  # only for plots
+    output=False,
+):
+    """
+    Usage example:
+    df = pd.DataFrame(...)  # Your DataFrame
+    res_qc = df_qc(df)
+    """
+    from statsmodels.stats.outliers_influence import variance_inflation_factor
+    from scipy.stats import skew, kurtosis, entropy
+    import skimpy
+
+    #! display(data.select_dtypes(include=[np.number]).describe())
+    #!skim
+    if columns is not None:
+        if isinstance(columns, (list,pd.core.indexes.base.Index)):
+            data=data[columns]
+    try:
+        skimpy.skim(data)
+    except:
+        numerical_data = data.select_dtypes(include=[np.number])
+        skimpy.skim(numerical_data)
+    # Fill completely NaN columns with a default value (e.g., 0)
+    data = data.copy()
+    data.loc[:, data.isna().all()] = 0
+    res_qc = {}
+
+    # Missing values
+    res_qc["missing_values"] = data.isnull().sum()
+    res_qc["missing_percentage"] = (res_qc["missing_values"] / len(data)) * 100
+    res_qc["rows_with_missing"] = data.isnull().any(axis=1).sum()
+
+    # Data types and unique values
+    res_qc["data_types"] = data.dtypes
+    res_qc["unique_values"] = data.nunique()
+    res_qc["constant_columns"] = [
+        col for col in data.columns if data[col].nunique() <= 1
+    ]
+
+    # Duplicate rows and columns
+    res_qc["duplicate_rows"] = data.duplicated().sum()
+    res_qc["duplicate_columns"] = data.columns[data.columns.duplicated()].tolist()
+
+    # Empty columns
+    res_qc["empty_columns"] = [col for col in data.columns if data[col].isnull().all()]
+
+    # outliers
+    data_outliers = df_outlier(data)
+    outlier_num = data_outliers.isna().sum() - data.isnull().sum()
+    res_qc["outlier_num"] = outlier_num[outlier_num > 0]
+    outlier_percentage=(outlier_num / len(data_outliers)) * 100
+    res_qc["outlier_percentage"] = outlier_percentage[outlier_percentage>0]
+    # Correlation and multicollinearity (VIF)
+    if any(data.dtypes.apply(pd.api.types.is_numeric_dtype)):
+        numeric_df = data.select_dtypes(include=[np.number]).dropna()
+        corr_matrix = numeric_df.corr()
+        high_corr_pairs = [
+            (col1, col2)
+            for col1 in corr_matrix.columns
+            for col2 in corr_matrix.columns
+            if col1 != col2 and abs(corr_matrix[col1][col2]) > 0.9
+        ]
+        res_qc["high_correlations"] = high_corr_pairs
+
+        # VIF for multicollinearity check
+        numeric_df = data.select_dtypes(include=[np.number]).dropna()
+        vif_data = pd.DataFrame()
+        res_qc["vif"]=vif_data
+        if numeric_df.shape[1] > 1:
+            vif_data["feature"] = numeric_df.columns
+            vif_data["VIF"] = [
+                variance_inflation_factor(numeric_df.values, i)
+                for i in range(numeric_df.shape[1])
+            ]
+            res_qc["vif"] = vif_data[
+                vif_data["VIF"] > 5
+            ]  # Typically VIF > 5 indicates multicollinearity
+    # Skewness and Kurtosis
+    skewness = data.skew(numeric_only=True)
+    kurtosis_vals = data.kurt(numeric_only=True)
+    res_qc["skewness"] = skewness[abs(skewness) > 1]
+    res_qc["kurtosis"] = kurtosis_vals[abs(kurtosis_vals) > 3]
+
+    # Entropy for categorical columns (higher entropy suggests more disorder)
+    categorical_cols = data.select_dtypes(include=["object", "category"]).columns
+    res_qc["entropy_categoricals"] = {
+        col: entropy(data[col].value_counts(normalize=True), base=2)
+        for col in categorical_cols
+    }
+    # number of unique
+    res_qc["unique_counts"] = data.nunique()
+    # dtypes counts
+    res_qc['dtype_counts']=data.dtypes.value_counts()
+
+    # Distribution Analysis (mean, median, mode, std dev, IQR for numeric columns)
+    distribution_stats = data.select_dtypes(include=[np.number]).describe().T
+    iqr = data.select_dtypes(include=[np.number]).apply(
+        lambda x: x.quantile(0.75) - x.quantile(0.25)
+    )
+    distribution_stats["IQR"] = iqr
+    res_qc["distribution_analysis"] = distribution_stats
+
+    # Variance Check: Identify low-variance columns
+    variance_threshold = 0.01
+    low_variance_cols = [
+        col
+        for col in data.select_dtypes(include=[np.number]).columns
+        if data[col].var() < variance_threshold
+    ]
+    res_qc["low_variance_features"] = low_variance_cols
+
+    # Categorical columns and cardinality
+    categorical_cols = data.select_dtypes(include=["object", "category"]).columns
+    high_cardinality = {
+        col: data[col].nunique() for col in categorical_cols if data[col].nunique() > 50
+    }
+    res_qc["high_cardinality_categoricals"] = high_cardinality
+
+    # Feature-type inconsistency (mixed types in columns)
+    inconsistent_types = {}
+    for col in data.columns:
+        unique_types = set(type(val) for val in data[col].dropna())
+        if len(unique_types) > 1:
+            inconsistent_types[col] = unique_types
+    res_qc["inconsistent_types"] = inconsistent_types
+
+
+    # Text length analysis for text fields
+    text_lengths = {}
+    for col in categorical_cols:
+        text_lengths[col] = {
+            "avg_length": data[col].dropna().apply(len).mean(),
+            "length_variance": data[col].dropna().apply(len).var(),
+        }
+    res_qc["text_length_analysis"] = text_lengths
+
+    # Summary statistics
+    res_qc["summary_statistics"] = data.describe().T
+
+    # Automated warnings
+    warnings = []
+    if res_qc["duplicate_rows"] > 0:
+        warnings.append("Warning: Duplicate rows detected.")
+    if len(res_qc["empty_columns"]) > 0:
+        warnings.append("Warning: Columns with only NaN values detected.")
+    if len(res_qc["constant_columns"]) > 0:
+        warnings.append("Warning: Columns with a single constant value detected.")
+    if len(high_corr_pairs) > 0:
+        warnings.append("Warning: Highly correlated columns detected.")
+    if len(res_qc["vif"]) > 0:
+        warnings.append("Warning: Multicollinearity detected in features.")
+    if len(high_cardinality) > 0:
+        warnings.append("Warning: High cardinality in categorical columns.")
+    if len(inconsistent_types) > 0:
+        warnings.append("Warning: Columns with mixed data types detected.")
+    res_qc["warnings"] = warnings
+
+    # Report generation
+    if verbose:
+        print("=== QC Report Summary ===")
+        print("\nMissing Values (Total and %):")
+        print(res_qc["missing_values"][res_qc["missing_values"] > 0])
+        print(res_qc["missing_percentage"][res_qc["missing_percentage"] > 0])
+
+        print("\nRows with Missing Values:", res_qc["rows_with_missing"])
+
+        print("\nData Types:")
+        print(res_qc["data_types"])
+
+        print("\nUnique Values per Column:")
+        print(res_qc["unique_values"])
+
+        print("\nConstant Columns:", res_qc["constant_columns"])
+
+        print("\nDuplicate Rows:", res_qc["duplicate_rows"])
+        print("Duplicate Columns:", res_qc["duplicate_columns"])
+
+        if res_qc["empty_columns"]:
+            print("\nEmpty Columns:", res_qc["empty_columns"])
+
+        print("\nOutlier Report:")
+        print(res_qc["outlier_num"])
+        print("\nPercentage of Values Replaced per Column:")
+        print(res_qc["outlier_percentage"])
+
+        print("\nHigh Correlations (>|0.9|):")
+        for col1, col2 in res_qc["high_correlations"]:
+            print(f"  {col1} and {col2}")
+
+        if "vif" in res_qc:
+            print("\nFeatures with High VIF (>|5|):")
+            print(res_qc["vif"])
+
+        print("\nHigh Cardinality Categorical Columns (>|50 unique|):")
+        print(res_qc["high_cardinality_categoricals"])
+
+        print("\nInconsistent Data Types:")
+        print(res_qc["inconsistent_types"])
+
+        print("\nRange Checks for Numeric Columns:")
+        print(res_qc["range_checks"])
+
+        print("\nText Length Analysis:")
+        for col, stats in res_qc["text_length_analysis"].items():
+            print(
+                f"{col}: Avg Length={stats['avg_length']}, Length Variance={stats['length_variance']}"
+            )
+
+        print("\nSummary Statistics:")
+        print(res_qc["summary_statistics"])
 
+        if res_qc["warnings"]:
+            print("\nWarnings:")
+            for warning in res_qc["warnings"]:
+                print("  -", warning)
+    if plot_:
+        df_qc_plots(data=data, res_qc=res_qc, max_cols=20)
+    if output:
+        return res_qc
+    return None
+
+
+def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20):
+    import matplotlib.pyplot as plt
+    import seaborn as sns
+    from .plot import subplot, figsets, get_color
+
+    if columns is not None:
+        if isinstance(columns, (list,pd.core.indexes.base.Index)):
+            data=data[columns]
+    len_total = len(res_qc)
+    n_row, n_col = int((len_total + 10) / 3), 3
+    nexttile = subplot(n_row, n_col, figsize=[5 * n_col, 5 * n_row],verbose=False)
+
+    missing_data = res_qc["missing_values"][res_qc["missing_values"] > 0].sort_values(
+        ascending=False
+    )
+    if len(missing_data) > max_cols:
+        missing_data = missing_data[:max_cols]
+    ax=sns.barplot(
+        x=missing_data.index,
+        y=missing_data.values,
+        hue=missing_data.index,
+        palette=get_color(len(missing_data), cmap="Blues")[::-1],
+        ax=nexttile(),
+    )
+    figsets(xangle=45, title="Missing (#)", ylabel="#",ax=ax)
+
+    ax2 = ax.twinx()
+    # Plot missing value percentages
+    missing_percentage = res_qc["missing_percentage"][
+        res_qc["missing_percentage"] > 0
+    ].sort_values(ascending=False)
+    sns.barplot(
+        x=missing_percentage.index,
+        y=missing_percentage.values,
+        hue=missing_percentage.index,
+        palette=get_color(len(missing_percentage), cmap="Blues")[::-1],
+        ax=ax2,#nexttile(),
+    )
+    figsets(xangle=45, ylabel="%",ax=ax2)
+    ax2.tick_params(axis="y", color='r',labelcolor='r')
+    ax2.yaxis.label.set_color('r')
+
+    outlier_num = res_qc["outlier_num"].sort_values(ascending=False)
+    if len(outlier_num) > max_cols:
+        outlier_num = outlier_num[:max_cols]
+    ax_outlier_num=sns.barplot(
+        x=outlier_num.index,
+        y=outlier_num.values,
+        hue=outlier_num.index,
+        palette=get_color(len(outlier_num), cmap="coolwarm")[::-1],
+        ax=nexttile(),
+    )
+    figsets(xangle=45, title="Outliers (#)", ylabel="#",xlabel=None)
+    ax_outlier_percentage = ax_outlier_num.twinx()
+    outlier_percentage = res_qc["outlier_percentage"].sort_values(ascending=False)
+    if len(outlier_percentage) > max_cols:
+        outlier_percentage = outlier_percentage[:max_cols]
+    ax_outlier_percentage=sns.barplot(
+        x=outlier_percentage.index,
+        y=outlier_percentage.values,
+        hue=outlier_percentage.index,
+        palette=get_color(len(outlier_percentage), cmap="coolwarm")[::-1],
+        ax=ax2 #nexttile(),
+    )
+    figsets(
+        xangle=45,
+        ylabel="%",
+        xlabel=None,
+        ylim=[0, outlier_percentage.max() + 2],
+        ax=ax_outlier_percentage
+    )
+    ax2.tick_params(axis="y", color='r',labelcolor='r')
+    ax2.yaxis.label.set_color('r')
+
+    # Skewness and Kurtosis Plots
+    skewness = res_qc["skewness"].sort_values(ascending=False)
+    kurtosis = res_qc["kurtosis"].sort_values(ascending=False)
+    if not skewness.empty:
+        ax_skewness=sns.barplot(
+            x=skewness.index,
+            y=skewness.values,
+            hue=skewness.index,
+            palette=get_color(len(skewness), cmap="coolwarm")[::-1],
+            ax=nexttile(),
+        )
+        figsets(
+            xangle=45,
+            title="Highly Skewed Numeric Columns (Skewness > 1)",
+            ylabel="Skewness",xlabel=None,ax=ax_skewness
+        )
+    if not kurtosis.empty:
+        ax_kurtosis=sns.barplot(
+            x=kurtosis.index,
+            y=kurtosis.values,
+            hue=kurtosis.index,
+            palette=get_color(len(kurtosis), cmap="coolwarm")[::-1],
+            ax=nexttile(),
+        )
+        figsets(
+            xangle=45,
+            title="Highly Kurtotic Numeric Columns (Kurtosis > 3)",
+            ylabel="Kurtosis",xlabel=None,ax=ax_kurtosis
+        )
+
+    # Entropy for Categorical Variables
+    entropy_data = pd.Series(res_qc["entropy_categoricals"]).sort_values(
+        ascending=False
+    )
+    ax_entropy_data=sns.barplot(
+        x=entropy_data.index, y=entropy_data.values,hue=entropy_data.index, palette="viridis", ax=nexttile()
+    )
+    figsets(
+        xangle=45,
+        xlabel="Categorical Columns",
+        title="Entropy of Categorical Variables",
+        ylabel="Entropy (bits)",
+        ax=ax_entropy_data
+    )
+    # Distribution Analysis: Boxplot for IQR
+    ax_iqr=sns.boxplot(
+        data=data[res_qc["distribution_analysis"].index],
+        orient="v",
+        palette="Set3",
+        ax=nexttile(),
+    )
+    figsets(
+        xangle=45,
+        title="Range for Numeric Columns",
+        ylabel="#",
+        ax=ax_iqr
+    )
+    # unique counts
+    unique_counts=res_qc["unique_counts"].sort_values(ascending=False)
+    ax_unique_counts_=sns.barplot(
+        x=unique_counts.index,
+        y=unique_counts.values,
+        hue=unique_counts.index,
+        palette=get_color(len(unique_counts)+10, cmap="Blues")[::-1],
+        ax=nexttile())
+    figsets(
+        xangle=45,
+        title="Unique Counts",
+        xlabel=None,
+        ylabel="#",
+        ax=ax_unique_counts_
+    )
+    # Binary Checking
+    ax_unique_counts=sns.barplot(x=unique_counts[unique_counts<10].index,
+        y=unique_counts[unique_counts<10].values,
+        hue=unique_counts[unique_counts<10].index,
+        palette=get_color(len(unique_counts[unique_counts<10])+10, cmap="Blues")[::-1],
+        ax=nexttile())
+    plt.axhline(y=2, color="r", linestyle="--", lw=2)
+    figsets(
+        xangle=45,
+        xlabel=None,
+        title="Binary Checking",
+        ylabel="#",
+        ax=ax_unique_counts
+    )
+
+    # dtypes counts
+    dtype_counts = res_qc['dtype_counts']
+    txt = []
+    for tp in dtype_counts.index:
+        txt.append(list(data.select_dtypes(include=tp).columns))
+
+    ax_dtype_counts = sns.barplot(
+        x=dtype_counts.index,
+        y=dtype_counts.values,
+        color="#F3C8B2",
+        ax=nexttile(),
+    )
+    max_columns_per_row = 1  # Maximum number of columns per row
+    for i, tp in enumerate(dtype_counts.index):
+        if i<=20:
+            column_names = txt[i]
+            # Split the column names into multiple lines if too long
+            column_name_str = ", ".join(column_names)
+            if len(column_name_str) > 40:  # If column names are too long, split them
+                column_name_str = "\n".join(
+                    [
+                        ", ".join(column_names[j : j + max_columns_per_row])
+                        for j in range(0, len(column_names), max_columns_per_row)
+                    ]
+                )
+            # Place text annotation with line breaks and rotate the text if needed
+            ax_dtype_counts.text(
+                i,
+                dtype_counts.values[i],
+                f"{column_name_str}",
+                ha="center",
+                va="top",
+                c="k",
+                fontsize=8,
+                rotation=0,
+            )
+    figsets(
+        xlabel=None,
+        title="Dtypes",
+        ylabel="#",
+        ax=ax_dtype_counts
+    )
+
+    # High cardinality: Show top categorical columns by unique value count
+    high_cardinality = res_qc["high_cardinality_categoricals"]
+    if high_cardinality and len(high_cardinality) > max_cols:
+        high_cardinality = dict(
+            sorted(high_cardinality.items(), key=lambda x: x[1], reverse=True)[
+                :max_cols
+            ]
+        )
+
+    if high_cardinality:
+        ax_high_cardinality=sns.barplot(
+            x=list(high_cardinality.keys()),
+            y=list(high_cardinality.values()),
+            hue=list(high_cardinality.keys()),
+            palette="Oranges", ax=nexttile()
+        )
+        figsets(
+            xangle=45,
+            title="High Cardinality Categorical Columns",
+            ylabel="Unique Value Count",
+            ax=ax_high_cardinality
+        )
+    if res_qc["low_variance_features"]:
+        low_variance_data = data[res_qc["low_variance_features"]].copy()
+        for col in low_variance_data.columns:
+            sns.histplot(
+                low_variance_data[col], bins=20, kde=True, color="coral", ax=nexttile()
+            )
+            plt.title(f"Low Variance Feature: {col}")
+
+    # VIF plot for multicollinearity detection
+    if "vif" in res_qc and not res_qc["vif"].empty:
+        vif_data = res_qc["vif"].sort_values(by="VIF", ascending=False)
+        if len(vif_data) > max_cols:
+            vif_data = vif_data[:max_cols]
+        ax_vif=sns.barplot(data=vif_data,
+            x="VIF",
+            y="feature",
+            hue="VIF",
+            palette=get_color(len(vif_data)+10, cmap="Blues")[::-1],
+            ax=nexttile())
+        figsets(
+            xangle=45,
+            title="Variance Inflation Factor(VIF)",
+            xlabel="Variance Inflation Factor(VIF)",
+            ylabel="Features",
+            legend=None,
+            ax=ax_vif
+        )
+
+    # Correlation heatmap for numeric columns with high correlation pairs
+    if any(data.dtypes.apply(pd.api.types.is_numeric_dtype)):
+        corr = data.select_dtypes(include=[np.number]).dropna().corr()
+        if corr.shape[1]<=33:
+            mask = np.triu(np.ones_like(corr, dtype=bool))
+            # Dynamically scale fontsize based on the number of columns
+            num_columns = corr.shape[1]
+            fontsize = max(6, min(12, 12 - (num_columns - 10) * 0.2))  # Scale between 8 and 12
+
+            ax_heatmap=sns.heatmap(
+                corr,
+                mask=mask,
+                annot=True,
+                cmap="coolwarm",
+                center=0,
+                fmt=".2f",
+                linewidths=0.5,
+                vmin=-1, vmax=1,
+                ax=nexttile(2, 2),
+                cbar_kws=dict(shrink=0.2,ticks=np.arange(-1, 2, 1)),
+                annot_kws={"size": fontsize}
+            )
+
+            figsets(
+                xangle=45,
+                title="Correlation Heatmap",
+                ax=ax_heatmap
+            )
 
 def use_pd(
     func_name="excel",
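Note: per the docstring, df_qc(df) runs the whole battery; skimpy, statsmodels, and seaborn must be importable, and plot_=False skips the df_qc_plots grid. One caveat visible in this version: the verbose report prints res_qc["range_checks"], which is never populated, so verbose=True may raise a KeyError. A minimal run under those assumptions:

    import numpy as np
    import pandas as pd
    from py2ls.ips import df_qc  # assumed import path

    df = pd.DataFrame({
        "x": np.random.default_rng(0).normal(size=200),
        "y": ["a", "b"] * 100,
        "empty": [np.nan] * 200,
    })
    res = df_qc(df, plot_=False, output=True)
    # 'empty' is zero-filled up front, so it surfaces as a constant column:
    print(res["warnings"])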