py2ls 0.2.4.14__py3-none-any.whl → 0.2.4.16__py3-none-any.whl

py2ls/ips.py CHANGED
@@ -3163,14 +3163,19 @@ def listdir(
     if kind is None:
         ls = os.listdir(rootdir)
         ls = [f for f in ls if not f.startswith(".") and not f.startswith("~")]
-        print(ls)
+        if verbose:
+            if len(ls)>20:
+                print(ls[:20])
+            else:
+                print(ls)
         df_all = pd.DataFrame(
             {
                 "fname": ls,
                 "fpath": [os.path.join(rootdir, i) for i in ls],
             }
         )
-        display(df_all)
+        if verbose:
+            display(df_all.head())
         return df_all
     if isinstance(kind, list):
         f_ = []
@@ -3206,6 +3211,7 @@ def listdir(
         "size": [],
         "fname": [],
         "fpath": [],
+        "basename":[],
     }
     for item in ls:
         item_path = os.path.join(rootdir, item)
@@ -3228,6 +3234,7 @@ def listdir(
             f["length"].append(len(filename))
             f["path"].append(os.path.join(os.path.dirname(item_path), item))
             fpath = os.path.join(os.path.dirname(item_path), item)
+            basename=os.path.basename(item_path)
             f["size"].append(round(os.path.getsize(fpath) / 1024 / 1024, 3))
             f["created_time"].append(
                 pd.to_datetime(os.path.getctime(item_path), unit="s")
@@ -3240,6 +3247,7 @@ def listdir(
             )
             f["fname"].append(filename) # will be removed
             f["fpath"].append(fpath) # will be removed
+            f['basename'].append(basename)
             i += 1

     f["num"] = i
@@ -3462,7 +3470,6 @@ def figsave(*args, dpi=300):
         img.save(fname, format=ftype.upper(), dpi=(dpi, dpi))
     elif isinstance(img, np.ndarray):
         import cv2
-
         # Check the shape of the image to determine color mode
         if img.ndim == 2:
             # Grayscale image
@@ -5055,16 +5062,22 @@ def _df_outlier(
     from scipy.stats import zscore
     from sklearn.ensemble import IsolationForest
     from sklearn.preprocessing import StandardScaler
-
+
+    # Fill completely NaN columns with a default value (e.g., 0)
+    data = data.copy()
+    data.loc[:, data.isna().all()] = 0
+    if columns is not None:
+        if isinstance(columns, (list,pd.core.indexes.base.Index)):
+            data=data[columns]
     col_names_org = data.columns.tolist()
     index_names_org = data.index.tolist()
     # Separate numeric and non-numeric columns
     numeric_data = data.select_dtypes(include=[np.number])
     non_numeric_data = data.select_dtypes(exclude=[np.number])

-    if columns is not None:
-        numeric_data = numeric_data[columns]
-    elif numeric_data.empty:
+    # if columns is not None:
+    #     numeric_data = numeric_data[columns]
+    if numeric_data.empty:
         raise ValueError("Input data must contain numeric columns.")

     outliers_df = pd.DataFrame(index=numeric_data.index)
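
A small sketch of the pre-processing these lines introduce, using plain pandas rather than calling _df_outlier itself: all-NaN columns are zero-filled up front, and an optional columns argument now subsets the frame before the numeric/non-numeric split (column names below are invented for illustration):

    import numpy as np
    import pandas as pd

    data = pd.DataFrame({
        "a": [1.0, 2.0, 100.0, 3.0],
        "b": [np.nan] * 4,          # completely empty column
        "c": list("wxyz"),          # non-numeric column
    })
    columns = ["a", "b"]            # hypothetical subset a caller might pass

    # Mirrors the inserted lines: zero-fill all-NaN columns, then apply the subset.
    data = data.copy()
    data.loc[:, data.isna().all()] = 0
    if columns is not None and isinstance(columns, (list, pd.core.indexes.base.Index)):
        data = data[columns]
    print(data.dtypes)              # only the requested columns remain, none all-NaN
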
@@ -5626,6 +5639,10 @@ def df_fillna(
     for col in data.columns:
         data[col] = data[col].apply(lambda x: np.nan if x is None else x)

+    # Fill completely NaN columns with a default value (e.g., 0)
+    data = data.copy()
+    data.loc[:, data.isna().all()] = 0
+
     col_names_org = data.columns.tolist()
     index_names_org = data.index.tolist()
     # Separate numeric and non-numeric columns
@@ -5682,11 +5699,11 @@ def df_fillna(
         imputed_data = imputer.fit_transform(numeric_data.T)
     else:
         raise ValueError("Invalid axis. Use 0 for columns or 1 for rows.")
-
+
     imputed_data = pd.DataFrame(
         imputed_data if axis == 0 else imputed_data.T,
-        index=numeric_data.index if axis == 0 else data.columns,
-        columns=numeric_data.columns if axis == 0 else data.index,
+        index=numeric_data.index if axis == 0 else numeric_data.columns,
+        columns=numeric_data.columns if axis == 0 else numeric_data.index,
     )
     for col in imputed_data.select_dtypes(include=[np.number]).columns:
         imputed_data[col] = imputed_data[col].astype(numeric_data[col].dtype)
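
A rough illustration of why the DataFrame constructor now takes its labels from numeric_data rather than data: once non-numeric columns are split off before imputation, the label sets of the two frames no longer agree (plain pandas sketch, not a call into df_fillna itself):

    import numpy as np
    import pandas as pd

    data = pd.DataFrame({
        "x": [1.0, np.nan, 3.0],
        "y": [np.nan, 5.0, 6.0],
        "label": ["a", "b", "c"],   # non-numeric column, excluded from imputation
    })

    # Same split df_fillna performs before imputing.
    numeric_data = data.select_dtypes(include=[np.number])
    non_numeric_data = data.select_dtypes(exclude=[np.number])

    print(list(data.columns))              # ['x', 'y', 'label']
    print(list(numeric_data.columns))      # ['x', 'y'] - the labels the imputed matrix really has
    print(list(non_numeric_data.columns))  # ['label']
    # Rebuilding the imputed frame with data.columns/data.index (the old behaviour)
    # therefore mixes in labels that the numeric matrix does not contain.
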
@@ -5826,8 +5843,13 @@ def df_encoder(
         from sklearn.preprocessing import LabelEncoder

         encoder = LabelEncoder()
-        encoded_data = data[columns].apply(lambda col: encoder.fit_transform(col))
-        return pd.concat([data.drop(columns, axis=1), encoded_data], axis=1)
+        # Apply LabelEncoder only to non-numeric columns
+        non_numeric_columns = [col for col in columns if not pd.api.types.is_numeric_dtype(data[col])]
+
+        if not non_numeric_columns:
+            return data
+        encoded_data = data[non_numeric_columns].apply(lambda col: encoder.fit_transform(col))
+        return pd.concat([data.drop(non_numeric_columns, axis=1), encoded_data], axis=1)

     # Target encoding (Mean of the target for each category)
     elif method == "target":
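
The revised branch above only label-encodes genuinely non-numeric columns and returns the frame untouched if none are found. A plain pandas/scikit-learn sketch of that filtering (data and column names invented for illustration):

    import pandas as pd
    from sklearn.preprocessing import LabelEncoder

    data = pd.DataFrame({
        "city": ["Berlin", "Paris", "Berlin"],
        "score": [0.1, 0.9, 0.5],   # numeric column, passes through unchanged
    })
    columns = ["city", "score"]     # a caller may list numeric columns by mistake

    encoder = LabelEncoder()
    # Same filtering the patch adds: only non-numeric columns are label-encoded.
    non_numeric_columns = [c for c in columns if not pd.api.types.is_numeric_dtype(data[c])]
    encoded = data[non_numeric_columns].apply(lambda col: encoder.fit_transform(col))
    out = pd.concat([data.drop(non_numeric_columns, axis=1), encoded], axis=1)
    print(out)  # 'score' is untouched, 'city' becomes integer codes
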
@@ -6878,7 +6900,188 @@ def df_reducer(
     # example:
     # df_reducer(data=data_log, columns=markers, n_components=2)

+def df_format(data, threshold_unique=0.5, verbose=False):
+    """
+    Detect whether a table is in long, wide, or uncertain format.
+
+    Parameters:
+    - data (pd.DataFrame): DataFrame to check.
+    - threshold_unique (float): Proportion threshold for detecting categorical columns.
+
+    Returns:
+    - "long" if detected as long format,
+    - "wide" if detected as wide format
+    - "uncertain" if ambiguous.
+    """
+    from scipy.stats import entropy
+    from sklearn.cluster import AgglomerativeClustering
+    from sklearn.preprocessing import StandardScaler
+
+    long_score = 0
+    wide_score = 0
+
+    n_rows, n_cols = data.shape
+
+    # Step 1: Row-Column Ratio Heuristic
+    if n_rows > 3 * n_cols:
+        long_score += 2
+        if verbose:
+            print(
+                "Row-Column Ratio suggests long format (many rows relative to columns)."
+            )
+    elif n_cols > 3 * n_rows:
+        wide_score += 2
+        if verbose:
+            print(
+                "Row-Column Ratio suggests wide format (many columns relative to rows)."
+            )
+
+    # Step 2: Unique-to-duplicate ratio and entropy for categorical variables
+    unique_counts = data.apply(lambda x: x.nunique())
+    duplicate_ratio = 1 - unique_counts / n_rows
+    if (duplicate_ratio > 0.2).sum() > 0.5 * n_cols:
+        wide_score += 2
+        if verbose:
+            print("High duplicate values in columns suggest wide format.")
+    else:
+        long_score += 1
+        if verbose:
+            print(
+                "Lower duplicate ratio suggests long format (higher row variability)."
+            )

+    # Calculate entropy for categorical columns
+    categorical_cols = data.select_dtypes(include=["object", "category"]).columns
+    if len(categorical_cols) > 0:
+        for col in categorical_cols:
+            counts = data[col].value_counts(normalize=True)
+            col_entropy = entropy(counts)
+            if col_entropy < 1.5:
+                long_score += 1
+                if verbose:
+                    print(
+                        f"Column '{col}' entropy suggests categorical, supporting long format."
+                    )
+            else:
+                wide_score += 1
+                if verbose:
+                    print(f"Column '{col}' entropy is higher, supporting wide format.")
+
+    # Step 3: Column grouping analysis for patterns in suffixes/prefixes
+    col_names = data.columns.astype(str)
+    suffix_count = sum("_" in col or col[-1].isdigit() for col in col_names)
+    if suffix_count > 0.3 * n_cols:
+        wide_score += 2
+        if verbose:
+            print(
+                "Detected suffix/prefix patterns in column names, suggesting wide format."
+            )
+
+    # Step 4: Entity identifier detection for long format with categorical columns
+    if len(categorical_cols) > 0 and n_rows > n_cols:
+        entity_identifier_count = sum(
+            data.duplicated(subset=categorical_cols, keep=False)
+        )
+        if entity_identifier_count > 0.2 * n_rows:
+            long_score += 2
+            if verbose:
+                print(
+                    "Significant duplicate rows based on categorical columns, suggesting long format."
+                )
+
+    # Step 5: Clustering analysis on numerical columns for correlation in wide format
+    numeric_cols = data.select_dtypes(include="number").columns
+    if len(numeric_cols) > 1:
+        scaled_data = StandardScaler().fit_transform(data[numeric_cols].dropna())
+        clustering = AgglomerativeClustering(n_clusters=2).fit(scaled_data.T)
+        cluster_labels = pd.Series(clustering.labels_)
+        if cluster_labels.nunique() < len(numeric_cols) * 0.5:
+            wide_score += 2
+            if verbose:
+                print("Clustering on columns shows grouping, suggesting wide format.")
+
+    # Step 6: Inter-column correlation analysis
+    if len(numeric_cols) > 1:
+        corr_matrix = data[numeric_cols].corr().abs()
+        avg_corr = (
+            corr_matrix.where(~np.eye(len(corr_matrix), dtype=bool)).mean().mean()
+        )
+        if avg_corr > 0.6:
+            wide_score += 2
+            if verbose:
+                print("High inter-column correlation suggests wide format.")
+
+    # Step 7: Missing value pattern analysis
+    missing_patterns = data.isna().sum(axis=1)
+    if missing_patterns.std() < 2:
+        wide_score += 1
+        if verbose:
+            print(
+                "Low variation in missing patterns across rows, supporting wide format."
+            )
+    elif missing_patterns.mean() < 1:
+        long_score += 1
+        if verbose:
+            print("Lower missing pattern suggests long format (less structured).")
+
+    # Step 8: Multi-level clustering on rows to detect block structure for wide format
+    if len(numeric_cols) > 1 and n_rows > 5:
+        clustering_rows = AgglomerativeClustering(n_clusters=2).fit(scaled_data)
+        if pd.Series(clustering_rows.labels_).nunique() < 2:
+            wide_score += 2
+            if verbose:
+                print("Row clustering reveals homogeneity, suggesting wide format.")
+
+    # Step 9: Sequential name detection for time-series pattern in wide format
+    if any(col.isdigit() or col.startswith("T") for col in col_names):
+        wide_score += 1
+        if verbose:
+            print("Detected time-like sequential column names, supporting wide format.")
+
+    # Step 10: Entropy of numeric columns
+    numeric_entropy = data[numeric_cols].apply(
+        lambda x: entropy(pd.cut(x, bins=10).value_counts(normalize=True))
+    )
+    if numeric_entropy.mean() < 2:
+        wide_score += 2
+        if verbose:
+            print(
+                "Low entropy in numeric columns indicates stability across columns, supporting wide format."
+            )
+
+    # Step 11: Tie-breaking strategy if scores are equal
+    if wide_score == long_score:
+        if n_cols > n_rows:
+            wide_score += 1
+            if verbose:
+                print(
+                    "Tie-breaking based on column-major structure, favoring wide format."
+                )
+        elif n_rows > n_cols:
+            long_score += 1
+            if verbose:
+                print(
+                    "Tie-breaking based on row-major structure, favoring long format."
+                )
+        else:
+            if verbose:
+                print("Tie-breaking inconclusive; returning 'uncertain'.")
+            return "uncertain"
+
+    # Final decision
+    if wide_score > long_score:
+        if verbose:
+            print("Final decision: Wide format.")
+        return "wide"
+    elif long_score > wide_score:
+        if verbose:
+            print("Final decision: Long format.")
+        return "long"
+    else:
+        if verbose:
+            print("Final decision: Uncertain format.")
+        return "uncertain"
+
 def plot_cluster(
     data: pd.DataFrame,
     labels: np.ndarray,
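
A hedged usage sketch for the new heuristic, assuming df_format is importable from py2ls.ips once this version is installed (scipy and scikit-learn are required by its imports):

    import pandas as pd
    from py2ls.ips import df_format  # assumed import path for the function added above

    # Long-ish frame: many rows, few columns, repeated categorical entries.
    long_df = pd.DataFrame({
        "subject": [f"s{i % 5}" for i in range(50)],
        "condition": ["a", "b"] * 25,
        "value": range(50),
    })

    # Wide-ish frame: few rows, many suffixed measurement columns.
    wide_df = pd.DataFrame({f"t_{i}": range(4) for i in range(20)})

    print(df_format(long_df, verbose=True))  # expected to lean towards "long"
    print(df_format(wide_df))                # expected to lean towards "wide"
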
@@ -7126,7 +7329,514 @@ def evaluate_cluster(
         metrics["V-Measure"] = np.nan

     return metrics
+def df_qc(
+    data: pd.DataFrame,
+    columns=None,
+    verbose=False,
+    plot_=True,
+    max_cols=20, # only for plots
+    output=False,
+):
+    """
+    Usage example:
+    df = pd.DataFrame(...)  # Your DataFrame
+    res_qc = df_qc(df)
+    """
+    from statsmodels.stats.outliers_influence import variance_inflation_factor
+    from scipy.stats import skew, kurtosis, entropy
+    import skimpy
+
+    #! display(data.select_dtypes(include=[np.number]).describe())
+    #!skim
+    if columns is not None:
+        if isinstance(columns, (list,pd.core.indexes.base.Index)):
+            data=data[columns]
+    try:
+        skimpy.skim(data)
+    except:
+        numerical_data = data.select_dtypes(include=[np.number])
+        skimpy.skim(numerical_data)
+    # Fill completely NaN columns with a default value (e.g., 0)
+    data = data.copy()
+    data.loc[:, data.isna().all()] = 0
+    res_qc = {}
+
+    # Missing values
+    res_qc["missing_values"] = data.isnull().sum()
+    res_qc["missing_percentage"] = (res_qc["missing_values"] / len(data)) * 100
+    res_qc["rows_with_missing"] = data.isnull().any(axis=1).sum()
+
+    # Data types and unique values
+    res_qc["data_types"] = data.dtypes
+    res_qc["unique_values"] = data.nunique()
+    res_qc["constant_columns"] = [
+        col for col in data.columns if data[col].nunique() <= 1
+    ]
+
+    # Duplicate rows and columns
+    res_qc["duplicate_rows"] = data.duplicated().sum()
+    res_qc["duplicate_columns"] = data.columns[data.columns.duplicated()].tolist()
+
+    # Empty columns
+    res_qc["empty_columns"] = [col for col in data.columns if data[col].isnull().all()]
+
+    # outliers
+    data_outliers = df_outlier(data)
+    outlier_num = data_outliers.isna().sum() - data.isnull().sum()
+    res_qc["outlier_num"] = outlier_num[outlier_num > 0]
+    outlier_percentage=(outlier_num / len(data_outliers)) * 100
+    res_qc["outlier_percentage"] = outlier_percentage[outlier_percentage>0]
+    # Correlation and multicollinearity (VIF)
+    if any(data.dtypes.apply(pd.api.types.is_numeric_dtype)):
+        numeric_df = data.select_dtypes(include=[np.number]).dropna()
+        corr_matrix = numeric_df.corr()
+        high_corr_pairs = [
+            (col1, col2)
+            for col1 in corr_matrix.columns
+            for col2 in corr_matrix.columns
+            if col1 != col2 and abs(corr_matrix[col1][col2]) > 0.9
+        ]
+        res_qc["high_correlations"] = high_corr_pairs
+
+        # VIF for multicollinearity check
+        numeric_df = data.select_dtypes(include=[np.number]).dropna()
+        vif_data = pd.DataFrame()
+        res_qc["vif"]=vif_data
+        if numeric_df.shape[1] > 1:
+            vif_data["feature"] = numeric_df.columns
+            vif_data["VIF"] = [
+                variance_inflation_factor(numeric_df.values, i)
+                for i in range(numeric_df.shape[1])
+            ]
+            res_qc["vif"] = vif_data[
+                vif_data["VIF"] > 5
+            ] # Typically VIF > 5 indicates multicollinearity
+    # Skewness and Kurtosis
+    skewness = data.skew(numeric_only=True)
+    kurtosis_vals = data.kurt(numeric_only=True)
+    res_qc["skewness"] = skewness[abs(skewness) > 1]
+    res_qc["kurtosis"] = kurtosis_vals[abs(kurtosis_vals) > 3]
+
+    # Entropy for categorical columns (higher entropy suggests more disorder)
+    categorical_cols = data.select_dtypes(include=["object", "category"]).columns
+    res_qc["entropy_categoricals"] = {
+        col: entropy(data[col].value_counts(normalize=True), base=2)
+        for col in categorical_cols
+    }
+    # number of unique
+    res_qc["unique_counts"] = data.nunique()
+    # dtypes counts
+    res_qc['dtype_counts']=data.dtypes.value_counts()
+
+    # Distribution Analysis (mean, median, mode, std dev, IQR for numeric columns)
+    distribution_stats = data.select_dtypes(include=[np.number]).describe().T
+    iqr = data.select_dtypes(include=[np.number]).apply(
+        lambda x: x.quantile(0.75) - x.quantile(0.25)
+    )
+    distribution_stats["IQR"] = iqr
+    res_qc["distribution_analysis"] = distribution_stats
+
+    # Variance Check: Identify low-variance columns
+    variance_threshold = 0.01
+    low_variance_cols = [
+        col
+        for col in data.select_dtypes(include=[np.number]).columns
+        if data[col].var() < variance_threshold
+    ]
+    res_qc["low_variance_features"] = low_variance_cols
+
+    # Categorical columns and cardinality
+    categorical_cols = data.select_dtypes(include=["object", "category"]).columns
+    high_cardinality = {
+        col: data[col].nunique() for col in categorical_cols if data[col].nunique() > 50
+    }
+    res_qc["high_cardinality_categoricals"] = high_cardinality
+
+    # Feature-type inconsistency (mixed types in columns)
+    inconsistent_types = {}
+    for col in data.columns:
+        unique_types = set(type(val) for val in data[col].dropna())
+        if len(unique_types) > 1:
+            inconsistent_types[col] = unique_types
+    res_qc["inconsistent_types"] = inconsistent_types
+
+
+    # Text length analysis for text fields
+    text_lengths = {}
+    for col in categorical_cols:
+        text_lengths[col] = {
+            "avg_length": data[col].dropna().apply(len).mean(),
+            "length_variance": data[col].dropna().apply(len).var(),
+        }
+    res_qc["text_length_analysis"] = text_lengths
+
+    # Summary statistics
+    res_qc["summary_statistics"] = data.describe().T
+
+    # Automated warnings
+    warnings = []
+    if res_qc["duplicate_rows"] > 0:
+        warnings.append("Warning: Duplicate rows detected.")
+    if len(res_qc["empty_columns"]) > 0:
+        warnings.append("Warning: Columns with only NaN values detected.")
+    if len(res_qc["constant_columns"]) > 0:
+        warnings.append("Warning: Columns with a single constant value detected.")
+    if len(high_corr_pairs) > 0:
+        warnings.append("Warning: Highly correlated columns detected.")
+    if len(res_qc["vif"]) > 0:
+        warnings.append("Warning: Multicollinearity detected in features.")
+    if len(high_cardinality) > 0:
+        warnings.append("Warning: High cardinality in categorical columns.")
+    if len(inconsistent_types) > 0:
+        warnings.append("Warning: Columns with mixed data types detected.")
+    res_qc["warnings"] = warnings
+
+    # Report generation
+    if verbose:
+        print("=== QC Report Summary ===")
+        print("\nMissing Values (Total and %):")
+        print(res_qc["missing_values"][res_qc["missing_values"] > 0])
+        print(res_qc["missing_percentage"][res_qc["missing_percentage"] > 0])
+
+        print("\nRows with Missing Values:", res_qc["rows_with_missing"])
+
+        print("\nData Types:")
+        print(res_qc["data_types"])
+
+        print("\nUnique Values per Column:")
+        print(res_qc["unique_values"])
+
+        print("\nConstant Columns:", res_qc["constant_columns"])
+
+        print("\nDuplicate Rows:", res_qc["duplicate_rows"])
+        print("Duplicate Columns:", res_qc["duplicate_columns"])
+
+        if res_qc["empty_columns"]:
+            print("\nEmpty Columns:", res_qc["empty_columns"])
+
+        print("\nOutlier Report:")
+        print(res_qc["outlier_num"])
+        print("\nPercentage of Values Replaced per Column:")
+        print(res_qc["outlier_percentage"])
+
+        print("\nHigh Correlations (>|0.9|):")
+        for col1, col2 in res_qc["high_correlations"]:
+            print(f" {col1} and {col2}")
+
+        if "vif" in res_qc:
+            print("\nFeatures with High VIF (>|5|):")
+            print(res_qc["vif"])
+
+        print("\nHigh Cardinality Categorical Columns (>|50 unique|):")
+        print(res_qc["high_cardinality_categoricals"])
+
+        print("\nInconsistent Data Types:")
+        print(res_qc["inconsistent_types"])
+
+        print("\nRange Checks for Numeric Columns:")
+        print(res_qc["range_checks"])
+
+        print("\nText Length Analysis:")
+        for col, stats in res_qc["text_length_analysis"].items():
+            print(
+                f"{col}: Avg Length={stats['avg_length']}, Length Variance={stats['length_variance']}"
+            )
+
+        print("\nSummary Statistics:")
+        print(res_qc["summary_statistics"])

+        if res_qc["warnings"]:
+            print("\nWarnings:")
+            for warning in res_qc["warnings"]:
+                print(" -", warning)
+    if plot_:
+        df_qc_plots(data=data, res_qc=res_qc, max_cols=20)
+    if output:
+        return res_qc
+    return None
+
+
+def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20):
+    import matplotlib.pyplot as plt
+    import seaborn as sns
+    from .plot import subplot, figsets, get_color
+
+    if columns is not None:
+        if isinstance(columns, (list,pd.core.indexes.base.Index)):
+            data=data[columns]
+    len_total = len(res_qc)
+    n_row, n_col = int((len_total + 10) / 3), 3
+    nexttile = subplot(n_row, n_col, figsize=[5 * n_col, 5 * n_row],verbose=False)
+
+    missing_data = res_qc["missing_values"][res_qc["missing_values"] > 0].sort_values(
+        ascending=False
+    )
+    if len(missing_data) > max_cols:
+        missing_data = missing_data[:max_cols]
+    ax=sns.barplot(
+        x=missing_data.index,
+        y=missing_data.values,
+        hue=missing_data.index,
+        palette=get_color(len(missing_data), cmap="Blues")[::-1],
+        ax=nexttile(),
+    )
+    figsets(xangle=45, title="Missing (#)", ylabel="#",ax=ax)
+
+    ax2 = ax.twinx()
+    # Plot missing value percentages
+    missing_percentage = res_qc["missing_percentage"][
+        res_qc["missing_percentage"] > 0
+    ].sort_values(ascending=False)
+    sns.barplot(
+        x=missing_percentage.index,
+        y=missing_percentage.values,
+        hue=missing_percentage.index,
+        palette=get_color(len(missing_percentage), cmap="Blues")[::-1],
+        ax=ax2,#nexttile(),
+    )
+    figsets(xangle=45, ylabel="%",ax=ax2)
+    ax2.tick_params(axis="y", color='r',labelcolor='r')
+    ax2.yaxis.label.set_color('r')
+
+    outlier_num = res_qc["outlier_num"].sort_values(ascending=False)
+    if len(outlier_num) > max_cols:
+        outlier_num = outlier_num[:max_cols]
+    ax_outlier_num=sns.barplot(
+        x=outlier_num.index,
+        y=outlier_num.values,
+        hue=outlier_num.index,
+        palette=get_color(len(outlier_num), cmap="coolwarm")[::-1],
+        ax=nexttile(),
+    )
+    figsets(xangle=45, title="Outliers (#)", ylabel="#",xlabel=None)
+    ax_outlier_percentage = ax_outlier_num.twinx()
+    outlier_percentage = res_qc["outlier_percentage"].sort_values(ascending=False)
+    if len(outlier_percentage) > max_cols:
+        outlier_percentage = outlier_percentage[:max_cols]
+    ax_outlier_percentage=sns.barplot(
+        x=outlier_percentage.index,
+        y=outlier_percentage.values,
+        hue=outlier_percentage.index,
+        palette=get_color(len(outlier_percentage), cmap="coolwarm")[::-1],
+        ax=ax2 #nexttile(),
+    )
+    figsets(
+        xangle=45,
+        ylabel="%",
+        xlabel=None,
+        ylim=[0, outlier_percentage.max() + 2],
+        ax=ax_outlier_percentage
+    )
+    ax2.tick_params(axis="y", color='r',labelcolor='r')
+    ax2.yaxis.label.set_color('r')
+
+    # Skewness and Kurtosis Plots
+    skewness = res_qc["skewness"].sort_values(ascending=False)
+    kurtosis = res_qc["kurtosis"].sort_values(ascending=False)
+    if not skewness.empty:
+        ax_skewness=sns.barplot(
+            x=skewness.index,
+            y=skewness.values,
+            hue=skewness.index,
+            palette=get_color(len(skewness), cmap="coolwarm")[::-1],
+            ax=nexttile(),
+        )
+        figsets(
+            xangle=45,
+            title="Highly Skewed Numeric Columns (Skewness > 1)",
+            ylabel="Skewness",xlabel=None,ax=ax_skewness
+        )
+    if not kurtosis.empty:
+        ax_kurtosis=sns.barplot(
+            x=kurtosis.index,
+            y=kurtosis.values,
+            hue=kurtosis.index,
+            palette=get_color(len(kurtosis), cmap="coolwarm")[::-1],
+            ax=nexttile(),
+        )
+        figsets(
+            xangle=45,
+            title="Highly Kurtotic Numeric Columns (Kurtosis > 3)",
+            ylabel="Kurtosis",xlabel=None,ax=ax_kurtosis
+        )
+
+    # Entropy for Categorical Variables
+    entropy_data = pd.Series(res_qc["entropy_categoricals"]).sort_values(
+        ascending=False
+    )
+    ax_entropy_data=sns.barplot(
+        x=entropy_data.index, y=entropy_data.values,hue=entropy_data.index, palette="viridis", ax=nexttile()
+    )
+    figsets(
+        xangle=45,
+        xlabel="Categorical Columns",
+        title="Entropy of Categorical Variables",
+        ylabel="Entropy (bits)",
+        ax=ax_entropy_data
+    )
+    # Distribution Analysis: Boxplot for IQR
+    ax_iqr=sns.boxplot(
+        data=data[res_qc["distribution_analysis"].index],
+        orient="v",
+        palette="Set3",
+        ax=nexttile(),
+    )
+    figsets(
+        xangle=45,
+        title="Range for Numeric Columns",
+        ylabel="#",
+        ax=ax_iqr
+    )
+    # unique counts
+    unique_counts=res_qc["unique_counts"].sort_values(ascending=False)
+    ax_unique_counts_=sns.barplot(
+        x=unique_counts.index,
+        y=unique_counts.values,
+        hue=unique_counts.index,
+        palette=get_color(len(unique_counts)+10, cmap="Blues")[::-1],
+        ax=nexttile())
+    figsets(
+        xangle=45,
+        title="Unique Counts",
+        xlabel=None,
+        ylabel="#",
+        ax=ax_unique_counts_
+    )
+    # Binary Checking
+    ax_unique_counts=sns.barplot(x=unique_counts[unique_counts<10].index,
+        y=unique_counts[unique_counts<10].values,
+        hue=unique_counts[unique_counts<10].index,
+        palette=get_color(len(unique_counts[unique_counts<10])+10, cmap="Blues")[::-1],
+        ax=nexttile())
+    plt.axhline(y=2, color="r", linestyle="--", lw=2)
+    figsets(
+        xangle=45,
+        xlabel=None,
+        title="Binary Checking",
+        ylabel="#",
+        ax=ax_unique_counts
+    )
+
+    # dtypes counts
+    dtype_counts = res_qc['dtype_counts']
+    txt = []
+    for tp in dtype_counts.index:
+        txt.append(list(data.select_dtypes(include=tp).columns))
+
+    ax_dtype_counts = sns.barplot(
+        x=dtype_counts.index,
+        y=dtype_counts.values,
+        color="#F3C8B2",
+        ax=nexttile(),
+    )
+    max_columns_per_row = 1 # Maximum number of columns per row
+    for i, tp in enumerate(dtype_counts.index):
+        if i<=20:
+            column_names = txt[i]
+            # Split the column names into multiple lines if too long
+            column_name_str = ", ".join(column_names)
+            if len(column_name_str) > 40: # If column names are too long, split them
+                column_name_str = "\n".join(
+                    [
+                        ", ".join(column_names[j : j + max_columns_per_row])
+                        for j in range(0, len(column_names), max_columns_per_row)
+                    ]
+                )
+            # Place text annotation with line breaks and rotate the text if needed
+            ax_dtype_counts.text(
+                i,
+                dtype_counts.values[i],
+                f"{column_name_str}",
+                ha="center",
+                va="top",
+                c="k",
+                fontsize=8,
+                rotation=0,
+            )
+    figsets(
+        xlabel=None,
+        title="Dtypes",
+        ylabel="#",
+        ax=ax_dtype_counts
+    )
+
+    # High cardinality: Show top categorical columns by unique value count
+    high_cardinality = res_qc["high_cardinality_categoricals"]
+    if high_cardinality and len(high_cardinality) > max_cols:
+        high_cardinality = dict(
+            sorted(high_cardinality.items(), key=lambda x: x[1], reverse=True)[
+                :max_cols
+            ]
+        )
+
+    if high_cardinality:
+        ax_high_cardinality=sns.barplot(
+            x=list(high_cardinality.keys()),
+            y=list(high_cardinality.values()),
+            hue=list(high_cardinality.keys()),
+            palette="Oranges", ax=nexttile()
+        )
+        figsets(
+            xangle=45,
+            title="High Cardinality Categorical Columns",
+            ylabel="Unique Value Count",
+            ax=ax_high_cardinality
+        )
+    if res_qc["low_variance_features"]:
+        low_variance_data = data[res_qc["low_variance_features"]].copy()
+        for col in low_variance_data.columns:
+            sns.histplot(
+                low_variance_data[col], bins=20, kde=True, color="coral", ax=nexttile()
+            )
+            plt.title(f"Low Variance Feature: {col}")
+
+    # VIF plot for multicollinearity detection
+    if "vif" in res_qc and not res_qc["vif"].empty:
+        vif_data = res_qc["vif"].sort_values(by="VIF", ascending=False)
+        if len(vif_data) > max_cols:
+            vif_data = vif_data[:max_cols]
+        ax_vif=sns.barplot(data=vif_data,
+            x="VIF",
+            y="feature",
+            hue="VIF",
+            palette=get_color(len(vif_data)+10, cmap="Blues")[::-1],
+            ax=nexttile())
+        figsets(
+            xangle=45,
+            title="Variance Inflation Factor(VIF)",
+            xlabel="Variance Inflation Factor(VIF)",
+            ylabel="Features",
+            legend=None,
+            ax=ax_vif
+        )
+
+    # Correlation heatmap for numeric columns with high correlation pairs
+    if any(data.dtypes.apply(pd.api.types.is_numeric_dtype)):
+        corr = data.select_dtypes(include=[np.number]).dropna().corr()
+        if corr.shape[1]<=33:
+            mask = np.triu(np.ones_like(corr, dtype=bool))
+            # Dynamically scale fontsize based on the number of columns
+            num_columns = corr.shape[1]
+            fontsize = max(6, min(12, 12 - (num_columns - 10) * 0.2)) # Scale between 8 and 12
+
+            ax_heatmap=sns.heatmap(
+                corr,
+                mask=mask,
+                annot=True,
+                cmap="coolwarm",
+                center=0,
+                fmt=".2f",
+                linewidths=0.5,
+                vmin=-1, vmax=1,
+                ax=nexttile(2, 2),
+                cbar_kws=dict(shrink=0.2,ticks=np.arange(-1, 2, 1)),
+                annot_kws={"size": fontsize}
+            )
+
+            figsets(
+                xangle=45,
+                title="Correlation Heatmap",
+                ax=ax_heatmap
+            )

 def use_pd(
     func_name="excel",
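
Finally, a rough sketch of driving the new QC helper, following the usage hint in its docstring; the import path is assumed, and skimpy, statsmodels and seaborn need to be installed for the full report and plots:

    import numpy as np
    import pandas as pd
    from py2ls.ips import df_qc  # assumed import path for the function added above

    df = pd.DataFrame({
        "age": [23, 25, np.nan, 31, 200],   # one missing value, one extreme value
        "group": ["a", "a", "b", "b", "b"],
        "empty": [np.nan] * 5,              # all-NaN column should trigger a warning
    })

    # plot_=False keeps it text-only; output=True returns the res_qc dict instead of None.
    res_qc = df_qc(df, verbose=False, plot_=False, output=True)
    print(res_qc["warnings"])
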