py2ls 0.2.4.15__py3-none-any.whl → 0.2.4.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
py2ls/ips.py CHANGED
@@ -3163,14 +3163,19 @@ def listdir(
     if kind is None:
         ls = os.listdir(rootdir)
         ls = [f for f in ls if not f.startswith(".") and not f.startswith("~")]
-        print(ls)
+        if verbose:
+            if len(ls)>20:
+                print(ls[:20])
+            else:
+                print(ls)
         df_all = pd.DataFrame(
             {
                 "fname": ls,
                 "fpath": [os.path.join(rootdir, i) for i in ls],
             }
         )
-        display(df_all)
+        if verbose:
+            display(df_all.head())
         return df_all
     if isinstance(kind, list):
         f_ = []
@@ -3206,6 +3211,7 @@ def listdir(
         "size": [],
         "fname": [],
         "fpath": [],
+        "basename":[],
     }
     for item in ls:
         item_path = os.path.join(rootdir, item)
@@ -3228,6 +3234,7 @@ def listdir(
             f["length"].append(len(filename))
             f["path"].append(os.path.join(os.path.dirname(item_path), item))
             fpath = os.path.join(os.path.dirname(item_path), item)
+            basename=os.path.basename(item_path)
             f["size"].append(round(os.path.getsize(fpath) / 1024 / 1024, 3))
             f["created_time"].append(
                 pd.to_datetime(os.path.getctime(item_path), unit="s")
@@ -3240,6 +3247,7 @@ def listdir(
             )
             f["fname"].append(filename) # will be removed
             f["fpath"].append(fpath) # will be removed
+            f['basename'].append(basename)
             i += 1

     f["num"] = i
@@ -3462,7 +3470,6 @@ def figsave(*args, dpi=300):
         img.save(fname, format=ftype.upper(), dpi=(dpi, dpi))
     elif isinstance(img, np.ndarray):
         import cv2
-
         # Check the shape of the image to determine color mode
         if img.ndim == 2:
             # Grayscale image
@@ -5055,16 +5062,22 @@ def _df_outlier(
     from scipy.stats import zscore
     from sklearn.ensemble import IsolationForest
     from sklearn.preprocessing import StandardScaler
-
+
+    # Fill completely NaN columns with a default value (e.g., 0)
+    data = data.copy()
+    data.loc[:, data.isna().all()] = 0
+    if columns is not None:
+        if isinstance(columns, (list,pd.core.indexes.base.Index)):
+            data=data[columns]
     col_names_org = data.columns.tolist()
     index_names_org = data.index.tolist()
     # Separate numeric and non-numeric columns
     numeric_data = data.select_dtypes(include=[np.number])
     non_numeric_data = data.select_dtypes(exclude=[np.number])

-    if columns is not None:
-        numeric_data = numeric_data[columns]
-    elif numeric_data.empty:
+    # if columns is not None:
+    #     numeric_data = numeric_data[columns]
+    if numeric_data.empty:
         raise ValueError("Input data must contain numeric columns.")

     outliers_df = pd.DataFrame(index=numeric_data.index)
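
The all-NaN prefill added at the top of _df_outlier can be illustrated in isolation with plain pandas; this is a self-contained sketch of the idiom, not py2ls-specific code:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [np.nan, np.nan, np.nan]})

    # Columns that are entirely NaN would otherwise trip up scalers and
    # IsolationForest, so they are replaced with a constant up front.
    df = df.copy()
    df.loc[:, df.isna().all()] = 0
    print(df)  # column "b" is now all zeros
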
@@ -5626,6 +5639,10 @@ def df_fillna(
     for col in data.columns:
         data[col] = data[col].apply(lambda x: np.nan if x is None else x)

+    # Fill completely NaN columns with a default value (e.g., 0)
+    data = data.copy()
+    data.loc[:, data.isna().all()] = 0
+
     col_names_org = data.columns.tolist()
     index_names_org = data.index.tolist()
     # Separate numeric and non-numeric columns
@@ -5682,11 +5699,11 @@ def df_fillna(
         imputed_data = imputer.fit_transform(numeric_data.T)
     else:
         raise ValueError("Invalid axis. Use 0 for columns or 1 for rows.")
-
+
     imputed_data = pd.DataFrame(
         imputed_data if axis == 0 else imputed_data.T,
-        index=numeric_data.index if axis == 0 else data.columns,
-        columns=numeric_data.columns if axis == 0 else data.index,
+        index=numeric_data.index if axis == 0 else numeric_data.columns,
+        columns=numeric_data.columns if axis == 0 else numeric_data.index,
     )
     for col in imputed_data.select_dtypes(include=[np.number]).columns:
         imputed_data[col] = imputed_data[col].astype(numeric_data[col].dtype)
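
The relabeling fix for axis=1 can be sanity-checked in isolation. The sketch below mirrors the intent of the corrected lines (reuse numeric_data's own labels after transposed imputation, rather than the labels of the full mixed-type frame) using plain pandas and scikit-learn:

    import numpy as np
    import pandas as pd
    from sklearn.impute import SimpleImputer

    numeric_data = pd.DataFrame(
        {"x": [1.0, np.nan, 3.0], "y": [4.0, 5.0, np.nan]},
        index=["r1", "r2", "r3"],
    )

    # Row-wise imputation: fit on the transposed frame so each original row
    # is imputed with its own mean, then transpose back.
    imputer = SimpleImputer(strategy="mean")
    imputed = imputer.fit_transform(numeric_data.T)  # shape (n_cols, n_rows)

    # Reattach numeric_data's own labels; using the labels of the original
    # mixed-type frame breaks once non-numeric columns were split off.
    imputed_df = pd.DataFrame(
        imputed.T,
        index=numeric_data.index,
        columns=numeric_data.columns,
    )
    print(imputed_df)
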
@@ -5826,8 +5843,13 @@ def df_encoder(
         from sklearn.preprocessing import LabelEncoder

         encoder = LabelEncoder()
-        encoded_data = data[columns].apply(lambda col: encoder.fit_transform(col))
-        return pd.concat([data.drop(columns, axis=1), encoded_data], axis=1)
+        # Apply LabelEncoder only to non-numeric columns
+        non_numeric_columns = [col for col in columns if not pd.api.types.is_numeric_dtype(data[col])]
+
+        if not non_numeric_columns:
+            return data
+        encoded_data = data[non_numeric_columns].apply(lambda col: encoder.fit_transform(col))
+        return pd.concat([data.drop(non_numeric_columns, axis=1), encoded_data], axis=1)

     # Target encoding (Mean of the target for each category)
     elif method == "target":
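
The label-encoding change above skips columns that are already numeric. A self-contained sketch of the same idiom with plain pandas and scikit-learn (mirroring the added lines, outside of df_encoder itself):

    import pandas as pd
    from sklearn.preprocessing import LabelEncoder

    data = pd.DataFrame({"city": ["Berlin", "Paris", "Berlin"], "age": [30, 25, 40]})
    columns = ["city", "age"]

    encoder = LabelEncoder()
    # Only non-numeric columns are label-encoded; numeric ones pass through.
    non_numeric_columns = [c for c in columns if not pd.api.types.is_numeric_dtype(data[c])]
    encoded = data[non_numeric_columns].apply(lambda col: encoder.fit_transform(col))
    result = pd.concat([data.drop(non_numeric_columns, axis=1), encoded], axis=1)
    print(result)  # "city" becomes integer codes, "age" is unchanged
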
@@ -6878,7 +6900,188 @@ def df_reducer(
     # example:
     # df_reducer(data=data_log, columns=markers, n_components=2)

+def df_format(data, threshold_unique=0.5, verbose=False):
+    """
+    Detect whether a table is in long, wide or uncertain format.
+
+    Parameters:
+    - data (pd.DataFrame): DataFrame to check.
+    - threshold_unique (float): Proportion threshold for detecting categorical columns.
+
+    Returns:
+    - "long" if detected as long format,
+    - "wide" if detected as wide format
+    - "uncertain" if ambiguous.
+    """
+    from scipy.stats import entropy
+    from sklearn.cluster import AgglomerativeClustering
+    from sklearn.preprocessing import StandardScaler
+
+    long_score = 0
+    wide_score = 0
+
+    n_rows, n_cols = data.shape
+
+    # Step 1: Row-Column Ratio Heuristic
+    if n_rows > 3 * n_cols:
+        long_score += 2
+        if verbose:
+            print(
+                "Row-Column Ratio suggests long format (many rows relative to columns)."
+            )
+    elif n_cols > 3 * n_rows:
+        wide_score += 2
+        if verbose:
+            print(
+                "Row-Column Ratio suggests wide format (many columns relative to rows)."
+            )
+
+    # Step 2: Unique-to-duplicate ratio and entropy for categorical variables
+    unique_counts = data.apply(lambda x: x.nunique())
+    duplicate_ratio = 1 - unique_counts / n_rows
+    if (duplicate_ratio > 0.2).sum() > 0.5 * n_cols:
+        wide_score += 2
+        if verbose:
+            print("High duplicate values in columns suggest wide format.")
+    else:
+        long_score += 1
+        if verbose:
+            print(
+                "Lower duplicate ratio suggests long format (higher row variability)."
+            )

+    # Calculate entropy for categorical columns
+    categorical_cols = data.select_dtypes(include=["object", "category"]).columns
+    if len(categorical_cols) > 0:
+        for col in categorical_cols:
+            counts = data[col].value_counts(normalize=True)
+            col_entropy = entropy(counts)
+            if col_entropy < 1.5:
+                long_score += 1
+                if verbose:
+                    print(
+                        f"Column '{col}' entropy suggests categorical, supporting long format."
+                    )
+            else:
+                wide_score += 1
+                if verbose:
+                    print(f"Column '{col}' entropy is higher, supporting wide format.")
+
+    # Step 3: Column grouping analysis for patterns in suffixes/prefixes
+    col_names = data.columns.astype(str)
+    suffix_count = sum("_" in col or col[-1].isdigit() for col in col_names)
+    if suffix_count > 0.3 * n_cols:
+        wide_score += 2
+        if verbose:
+            print(
+                "Detected suffix/prefix patterns in column names, suggesting wide format."
+            )
+
+    # Step 4: Entity identifier detection for long format with categorical columns
+    if len(categorical_cols) > 0 and n_rows > n_cols:
+        entity_identifier_count = sum(
+            data.duplicated(subset=categorical_cols, keep=False)
+        )
+        if entity_identifier_count > 0.2 * n_rows:
+            long_score += 2
+            if verbose:
+                print(
+                    "Significant duplicate rows based on categorical columns, suggesting long format."
+                )
+
+    # Step 5: Clustering analysis on numerical columns for correlation in wide format
+    numeric_cols = data.select_dtypes(include="number").columns
+    if len(numeric_cols) > 1:
+        scaled_data = StandardScaler().fit_transform(data[numeric_cols].dropna())
+        clustering = AgglomerativeClustering(n_clusters=2).fit(scaled_data.T)
+        cluster_labels = pd.Series(clustering.labels_)
+        if cluster_labels.nunique() < len(numeric_cols) * 0.5:
+            wide_score += 2
+            if verbose:
+                print("Clustering on columns shows grouping, suggesting wide format.")
+
+    # Step 6: Inter-column correlation analysis
+    if len(numeric_cols) > 1:
+        corr_matrix = data[numeric_cols].corr().abs()
+        avg_corr = (
+            corr_matrix.where(~np.eye(len(corr_matrix), dtype=bool)).mean().mean()
+        )
+        if avg_corr > 0.6:
+            wide_score += 2
+            if verbose:
+                print("High inter-column correlation suggests wide format.")
+
+    # Step 7: Missing value pattern analysis
+    missing_patterns = data.isna().sum(axis=1)
+    if missing_patterns.std() < 2:
+        wide_score += 1
+        if verbose:
+            print(
+                "Low variation in missing patterns across rows, supporting wide format."
+            )
+    elif missing_patterns.mean() < 1:
+        long_score += 1
+        if verbose:
+            print("Lower missing pattern suggests long format (less structured).")
+
+    # Step 8: Multi-level clustering on rows to detect block structure for wide format
+    if len(numeric_cols) > 1 and n_rows > 5:
+        clustering_rows = AgglomerativeClustering(n_clusters=2).fit(scaled_data)
+        if pd.Series(clustering_rows.labels_).nunique() < 2:
+            wide_score += 2
+            if verbose:
+                print("Row clustering reveals homogeneity, suggesting wide format.")
+
+    # Step 9: Sequential name detection for time-series pattern in wide format
+    if any(col.isdigit() or col.startswith("T") for col in col_names):
+        wide_score += 1
+        if verbose:
+            print("Detected time-like sequential column names, supporting wide format.")
+
+    # Step 10: Entropy of numeric columns
+    numeric_entropy = data[numeric_cols].apply(
+        lambda x: entropy(pd.cut(x, bins=10).value_counts(normalize=True))
+    )
+    if numeric_entropy.mean() < 2:
+        wide_score += 2
+        if verbose:
+            print(
+                "Low entropy in numeric columns indicates stability across columns, supporting wide format."
+            )
+
+    # Step 11: Tie-breaking strategy if scores are equal
+    if wide_score == long_score:
+        if n_cols > n_rows:
+            wide_score += 1
+            if verbose:
+                print(
+                    "Tie-breaking based on column-major structure, favoring wide format."
+                )
+        elif n_rows > n_cols:
+            long_score += 1
+            if verbose:
+                print(
+                    "Tie-breaking based on row-major structure, favoring long format."
+                )
+        else:
+            if verbose:
+                print("Tie-breaking inconclusive; returning 'uncertain'.")
+            return "uncertain"
+
+    # Final decision
+    if wide_score > long_score:
+        if verbose:
+            print("Final decision: Wide format.")
+        return "wide"
+    elif long_score > wide_score:
+        if verbose:
+            print("Final decision: Long format.")
+        return "long"
+    else:
+        if verbose:
+            print("Final decision: Uncertain format.")
+        return "uncertain"
+
 def plot_cluster(
     data: pd.DataFrame,
     labels: np.ndarray,
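
A hedged usage sketch for the new df_format heuristic; the import path and the function's placement at module level are assumptions based on the hunk above, and the heuristic scores mean the results are expectations rather than guarantees:

    import numpy as np
    import pandas as pd
    from py2ls.ips import df_format  # assumed import path

    # Wide-ish frame: few rows, many "T0".."T19" measurement columns.
    wide = pd.DataFrame(np.random.rand(5, 20), columns=[f"T{i}" for i in range(20)])

    # Long-ish frame: many rows, one id column and one value column.
    long = pd.DataFrame({
        "subject": np.repeat(list("abcd"), 50),
        "value": np.random.rand(200),
    })

    print(df_format(wide, verbose=True))  # expected to lean towards "wide"
    print(df_format(long))                # expected to lean towards "long"
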
@@ -7126,7 +7329,514 @@ def evaluate_cluster(
         metrics["V-Measure"] = np.nan

     return metrics
+def df_qc(
+    data: pd.DataFrame,
+    columns=None,
+    verbose=False,
+    plot_=True,
+    max_cols=20, # only for plots
+    output=False,
+):
+    """
+    Usage example:
+    df = pd.DataFrame(...) # Your DataFrame
+    res_qc = df_qc(df)
+    """
+    from statsmodels.stats.outliers_influence import variance_inflation_factor
+    from scipy.stats import skew, kurtosis, entropy
+    import skimpy
+
+    #! display(data.select_dtypes(include=[np.number]).describe())
+    #!skim
+    if columns is not None:
+        if isinstance(columns, (list,pd.core.indexes.base.Index)):
+            data=data[columns]
+    try:
+        skimpy.skim(data)
+    except:
+        numerical_data = data.select_dtypes(include=[np.number])
+        skimpy.skim(numerical_data)
+    # Fill completely NaN columns with a default value (e.g., 0)
+    data = data.copy()
+    data.loc[:, data.isna().all()] = 0
+    res_qc = {}
+
+    # Missing values
+    res_qc["missing_values"] = data.isnull().sum()
+    res_qc["missing_percentage"] = (res_qc["missing_values"] / len(data)) * 100
+    res_qc["rows_with_missing"] = data.isnull().any(axis=1).sum()
+
+    # Data types and unique values
+    res_qc["data_types"] = data.dtypes
+    res_qc["unique_values"] = data.nunique()
+    res_qc["constant_columns"] = [
+        col for col in data.columns if data[col].nunique() <= 1
+    ]
+
+    # Duplicate rows and columns
+    res_qc["duplicate_rows"] = data.duplicated().sum()
+    res_qc["duplicate_columns"] = data.columns[data.columns.duplicated()].tolist()
+
+    # Empty columns
+    res_qc["empty_columns"] = [col for col in data.columns if data[col].isnull().all()]
+
+    # outliers
+    data_outliers = df_outlier(data)
+    outlier_num = data_outliers.isna().sum() - data.isnull().sum()
+    res_qc["outlier_num"] = outlier_num[outlier_num > 0]
+    outlier_percentage=(outlier_num / len(data_outliers)) * 100
+    res_qc["outlier_percentage"] = outlier_percentage[outlier_percentage>0]
+    # Correlation and multicollinearity (VIF)
+    if any(data.dtypes.apply(pd.api.types.is_numeric_dtype)):
+        numeric_df = data.select_dtypes(include=[np.number]).dropna()
+        corr_matrix = numeric_df.corr()
+        high_corr_pairs = [
+            (col1, col2)
+            for col1 in corr_matrix.columns
+            for col2 in corr_matrix.columns
+            if col1 != col2 and abs(corr_matrix[col1][col2]) > 0.9
+        ]
+        res_qc["high_correlations"] = high_corr_pairs
+
+        # VIF for multicollinearity check
+        numeric_df = data.select_dtypes(include=[np.number]).dropna()
+        vif_data = pd.DataFrame()
+        res_qc["vif"]=vif_data
+        if numeric_df.shape[1] > 1:
+            vif_data["feature"] = numeric_df.columns
+            vif_data["VIF"] = [
+                variance_inflation_factor(numeric_df.values, i)
+                for i in range(numeric_df.shape[1])
+            ]
+            res_qc["vif"] = vif_data[
+                vif_data["VIF"] > 5
+            ] # Typically VIF > 5 indicates multicollinearity
+    # Skewness and Kurtosis
+    skewness = data.skew(numeric_only=True)
+    kurtosis_vals = data.kurt(numeric_only=True)
+    res_qc["skewness"] = skewness[abs(skewness) > 1]
+    res_qc["kurtosis"] = kurtosis_vals[abs(kurtosis_vals) > 3]
+
+    # Entropy for categorical columns (higher entropy suggests more disorder)
+    categorical_cols = data.select_dtypes(include=["object", "category"]).columns
+    res_qc["entropy_categoricals"] = {
+        col: entropy(data[col].value_counts(normalize=True), base=2)
+        for col in categorical_cols
+    }
+    # number of unique
+    res_qc["unique_counts"] = data.nunique()
+    # dtypes counts
+    res_qc['dtype_counts']=data.dtypes.value_counts()
+
+    # Distribution Analysis (mean, median, mode, std dev, IQR for numeric columns)
+    distribution_stats = data.select_dtypes(include=[np.number]).describe().T
+    iqr = data.select_dtypes(include=[np.number]).apply(
+        lambda x: x.quantile(0.75) - x.quantile(0.25)
+    )
+    distribution_stats["IQR"] = iqr
+    res_qc["distribution_analysis"] = distribution_stats
+
+    # Variance Check: Identify low-variance columns
+    variance_threshold = 0.01
+    low_variance_cols = [
+        col
+        for col in data.select_dtypes(include=[np.number]).columns
+        if data[col].var() < variance_threshold
+    ]
+    res_qc["low_variance_features"] = low_variance_cols
+
+    # Categorical columns and cardinality
+    categorical_cols = data.select_dtypes(include=["object", "category"]).columns
+    high_cardinality = {
+        col: data[col].nunique() for col in categorical_cols if data[col].nunique() > 50
+    }
+    res_qc["high_cardinality_categoricals"] = high_cardinality
+
+    # Feature-type inconsistency (mixed types in columns)
+    inconsistent_types = {}
+    for col in data.columns:
+        unique_types = set(type(val) for val in data[col].dropna())
+        if len(unique_types) > 1:
+            inconsistent_types[col] = unique_types
+    res_qc["inconsistent_types"] = inconsistent_types
+
+
+    # Text length analysis for text fields
+    text_lengths = {}
+    for col in categorical_cols:
+        text_lengths[col] = {
+            "avg_length": data[col].dropna().apply(len).mean(),
+            "length_variance": data[col].dropna().apply(len).var(),
+        }
+    res_qc["text_length_analysis"] = text_lengths
+
+    # Summary statistics
+    res_qc["summary_statistics"] = data.describe().T
+
+    # Automated warnings
+    warnings = []
+    if res_qc["duplicate_rows"] > 0:
+        warnings.append("Warning: Duplicate rows detected.")
+    if len(res_qc["empty_columns"]) > 0:
+        warnings.append("Warning: Columns with only NaN values detected.")
+    if len(res_qc["constant_columns"]) > 0:
+        warnings.append("Warning: Columns with a single constant value detected.")
+    if len(high_corr_pairs) > 0:
+        warnings.append("Warning: Highly correlated columns detected.")
+    if len(res_qc["vif"]) > 0:
+        warnings.append("Warning: Multicollinearity detected in features.")
+    if len(high_cardinality) > 0:
+        warnings.append("Warning: High cardinality in categorical columns.")
+    if len(inconsistent_types) > 0:
+        warnings.append("Warning: Columns with mixed data types detected.")
+    res_qc["warnings"] = warnings
+
+    # Report generation
+    if verbose:
+        print("=== QC Report Summary ===")
+        print("\nMissing Values (Total and %):")
+        print(res_qc["missing_values"][res_qc["missing_values"] > 0])
+        print(res_qc["missing_percentage"][res_qc["missing_percentage"] > 0])
+
+        print("\nRows with Missing Values:", res_qc["rows_with_missing"])
+
+        print("\nData Types:")
+        print(res_qc["data_types"])
+
+        print("\nUnique Values per Column:")
+        print(res_qc["unique_values"])
+
+        print("\nConstant Columns:", res_qc["constant_columns"])
+
+        print("\nDuplicate Rows:", res_qc["duplicate_rows"])
+        print("Duplicate Columns:", res_qc["duplicate_columns"])
+
+        if res_qc["empty_columns"]:
+            print("\nEmpty Columns:", res_qc["empty_columns"])
+
+        print("\nOutlier Report:")
+        print(res_qc["outlier_num"])
+        print("\nPercentage of Values Replaced per Column:")
+        print(res_qc["outlier_percentage"])
+
+        print("\nHigh Correlations (>|0.9|):")
+        for col1, col2 in res_qc["high_correlations"]:
+            print(f" {col1} and {col2}")
+
+        if "vif" in res_qc:
+            print("\nFeatures with High VIF (>|5|):")
+            print(res_qc["vif"])
+
+        print("\nHigh Cardinality Categorical Columns (>|50 unique|):")
+        print(res_qc["high_cardinality_categoricals"])
+
+        print("\nInconsistent Data Types:")
+        print(res_qc["inconsistent_types"])
+
+        print("\nRange Checks for Numeric Columns:")
+        print(res_qc["range_checks"])
+
+        print("\nText Length Analysis:")
+        for col, stats in res_qc["text_length_analysis"].items():
+            print(
+                f"{col}: Avg Length={stats['avg_length']}, Length Variance={stats['length_variance']}"
+            )
+
+        print("\nSummary Statistics:")
+        print(res_qc["summary_statistics"])

+        if res_qc["warnings"]:
+            print("\nWarnings:")
+            for warning in res_qc["warnings"]:
+                print(" -", warning)
+    if plot_:
+        df_qc_plots(data=data, res_qc=res_qc, max_cols=20)
+    if output:
+        return res_qc
+    return None
+
+
+def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20):
+    import matplotlib.pyplot as plt
+    import seaborn as sns
+    from .plot import subplot, figsets, get_color
+
+    if columns is not None:
+        if isinstance(columns, (list,pd.core.indexes.base.Index)):
+            data=data[columns]
+    len_total = len(res_qc)
+    n_row, n_col = int((len_total + 10) / 3), 3
+    nexttile = subplot(n_row, n_col, figsize=[5 * n_col, 5 * n_row],verbose=False)
+
+    missing_data = res_qc["missing_values"][res_qc["missing_values"] > 0].sort_values(
+        ascending=False
+    )
+    if len(missing_data) > max_cols:
+        missing_data = missing_data[:max_cols]
+    ax=sns.barplot(
+        x=missing_data.index,
+        y=missing_data.values,
+        hue=missing_data.index,
+        palette=get_color(len(missing_data), cmap="Blues")[::-1],
+        ax=nexttile(),
+    )
+    figsets(xangle=45, title="Missing (#)", ylabel="#",ax=ax)
+
+    ax2 = ax.twinx()
+    # Plot missing value percentages
+    missing_percentage = res_qc["missing_percentage"][
+        res_qc["missing_percentage"] > 0
+    ].sort_values(ascending=False)
+    sns.barplot(
+        x=missing_percentage.index,
+        y=missing_percentage.values,
+        hue=missing_percentage.index,
+        palette=get_color(len(missing_percentage), cmap="Blues")[::-1],
+        ax=ax2,#nexttile(),
+    )
+    figsets(xangle=45, ylabel="%",ax=ax2)
+    ax2.tick_params(axis="y", color='r',labelcolor='r')
+    ax2.yaxis.label.set_color('r')
+
+    outlier_num = res_qc["outlier_num"].sort_values(ascending=False)
+    if len(outlier_num) > max_cols:
+        outlier_num = outlier_num[:max_cols]
+    ax_outlier_num=sns.barplot(
+        x=outlier_num.index,
+        y=outlier_num.values,
+        hue=outlier_num.index,
+        palette=get_color(len(outlier_num), cmap="coolwarm")[::-1],
+        ax=nexttile(),
+    )
+    figsets(xangle=45, title="Outliers (#)", ylabel="#",xlabel=None)
+    ax_outlier_percentage = ax_outlier_num.twinx()
+    outlier_percentage = res_qc["outlier_percentage"].sort_values(ascending=False)
+    if len(outlier_percentage) > max_cols:
+        outlier_percentage = outlier_percentage[:max_cols]
+    ax_outlier_percentage=sns.barplot(
+        x=outlier_percentage.index,
+        y=outlier_percentage.values,
+        hue=outlier_percentage.index,
+        palette=get_color(len(outlier_percentage), cmap="coolwarm")[::-1],
+        ax=ax2 #nexttile(),
+    )
+    figsets(
+        xangle=45,
+        ylabel="%",
+        xlabel=None,
+        ylim=[0, outlier_percentage.max() + 2],
+        ax=ax_outlier_percentage
+    )
+    ax2.tick_params(axis="y", color='r',labelcolor='r')
+    ax2.yaxis.label.set_color('r')
+
+    # Skewness and Kurtosis Plots
+    skewness = res_qc["skewness"].sort_values(ascending=False)
+    kurtosis = res_qc["kurtosis"].sort_values(ascending=False)
+    if not skewness.empty:
+        ax_skewness=sns.barplot(
+            x=skewness.index,
+            y=skewness.values,
+            hue=skewness.index,
+            palette=get_color(len(skewness), cmap="coolwarm")[::-1],
+            ax=nexttile(),
+        )
+        figsets(
+            xangle=45,
+            title="Highly Skewed Numeric Columns (Skewness > 1)",
+            ylabel="Skewness",xlabel=None,ax=ax_skewness
+        )
+    if not kurtosis.empty:
+        ax_kurtosis=sns.barplot(
+            x=kurtosis.index,
+            y=kurtosis.values,
+            hue=kurtosis.index,
+            palette=get_color(len(kurtosis), cmap="coolwarm")[::-1],
+            ax=nexttile(),
+        )
+        figsets(
+            xangle=45,
+            title="Highly Kurtotic Numeric Columns (Kurtosis > 3)",
+            ylabel="Kurtosis",xlabel=None,ax=ax_kurtosis
+        )
+
+    # Entropy for Categorical Variables
+    entropy_data = pd.Series(res_qc["entropy_categoricals"]).sort_values(
+        ascending=False
+    )
+    ax_entropy_data=sns.barplot(
+        x=entropy_data.index, y=entropy_data.values,hue=entropy_data.index, palette="viridis", ax=nexttile()
+    )
+    figsets(
+        xangle=45,
+        xlabel="Categorical Columns",
+        title="Entropy of Categorical Variables",
+        ylabel="Entropy (bits)",
+        ax=ax_entropy_data
+    )
+    # Distribution Analysis: Boxplot for IQR
+    ax_iqr=sns.boxplot(
+        data=data[res_qc["distribution_analysis"].index],
+        orient="v",
+        palette="Set3",
+        ax=nexttile(),
+    )
+    figsets(
+        xangle=45,
+        title="Range for Numeric Columns",
+        ylabel="#",
+        ax=ax_iqr
+    )
+    # unique counts
+    unique_counts=res_qc["unique_counts"].sort_values(ascending=False)
+    ax_unique_counts_=sns.barplot(
+        x=unique_counts.index,
+        y=unique_counts.values,
+        hue=unique_counts.index,
+        palette=get_color(len(unique_counts)+10, cmap="Blues")[::-1],
+        ax=nexttile())
+    figsets(
+        xangle=45,
+        title="Unique Counts",
+        xlabel=None,
+        ylabel="#",
+        ax=ax_unique_counts_
+    )
+    # Binary Checking
+    ax_unique_counts=sns.barplot(x=unique_counts[unique_counts<10].index,
+        y=unique_counts[unique_counts<10].values,
+        hue=unique_counts[unique_counts<10].index,
+        palette=get_color(len(unique_counts[unique_counts<10])+10, cmap="Blues")[::-1],
+        ax=nexttile())
+    plt.axhline(y=2, color="r", linestyle="--", lw=2)
+    figsets(
+        xangle=45,
+        xlabel=None,
+        title="Binary Checking",
+        ylabel="#",
+        ax=ax_unique_counts
+    )
+
+    # dtypes counts
+    dtype_counts = res_qc['dtype_counts']
+    txt = []
+    for tp in dtype_counts.index:
+        txt.append(list(data.select_dtypes(include=tp).columns))
+
+    ax_dtype_counts = sns.barplot(
+        x=dtype_counts.index,
+        y=dtype_counts.values,
+        color="#F3C8B2",
+        ax=nexttile(),
+    )
+    max_columns_per_row = 1 # Maximum number of columns per row
+    for i, tp in enumerate(dtype_counts.index):
+        if i<=20:
+            column_names = txt[i]
+            # Split the column names into multiple lines if too long
+            column_name_str = ", ".join(column_names)
+            if len(column_name_str) > 40: # If column names are too long, split them
+                column_name_str = "\n".join(
+                    [
+                        ", ".join(column_names[j : j + max_columns_per_row])
+                        for j in range(0, len(column_names), max_columns_per_row)
+                    ]
+                )
+            # Place text annotation with line breaks and rotate the text if needed
+            ax_dtype_counts.text(
+                i,
+                dtype_counts.values[i],
+                f"{column_name_str}",
+                ha="center",
+                va="top",
+                c="k",
+                fontsize=8,
+                rotation=0,
+            )
+    figsets(
+        xlabel=None,
+        title="Dtypes",
+        ylabel="#",
+        ax=ax_dtype_counts
+    )
+
+    # High cardinality: Show top categorical columns by unique value count
+    high_cardinality = res_qc["high_cardinality_categoricals"]
+    if high_cardinality and len(high_cardinality) > max_cols:
+        high_cardinality = dict(
+            sorted(high_cardinality.items(), key=lambda x: x[1], reverse=True)[
+                :max_cols
+            ]
+        )
+
+    if high_cardinality:
+        ax_high_cardinality=sns.barplot(
+            x=list(high_cardinality.keys()),
+            y=list(high_cardinality.values()),
+            hue=list(high_cardinality.keys()),
+            palette="Oranges", ax=nexttile()
+        )
+        figsets(
+            xangle=45,
+            title="High Cardinality Categorical Columns",
+            ylabel="Unique Value Count",
+            ax=ax_high_cardinality
+        )
+    if res_qc["low_variance_features"]:
+        low_variance_data = data[res_qc["low_variance_features"]].copy()
+        for col in low_variance_data.columns:
+            sns.histplot(
+                low_variance_data[col], bins=20, kde=True, color="coral", ax=nexttile()
+            )
+            plt.title(f"Low Variance Feature: {col}")
+
+    # VIF plot for multicollinearity detection
+    if "vif" in res_qc and not res_qc["vif"].empty:
+        vif_data = res_qc["vif"].sort_values(by="VIF", ascending=False)
+        if len(vif_data) > max_cols:
+            vif_data = vif_data[:max_cols]
+        ax_vif=sns.barplot(data=vif_data,
+            x="VIF",
+            y="feature",
+            hue="VIF",
+            palette=get_color(len(vif_data)+10, cmap="Blues")[::-1],
+            ax=nexttile())
+        figsets(
+            xangle=45,
+            title="Variance Inflation Factor(VIF)",
+            xlabel="Variance Inflation Factor(VIF)",
+            ylabel="Features",
+            legend=None,
+            ax=ax_vif
+        )
+
+    # Correlation heatmap for numeric columns with high correlation pairs
+    if any(data.dtypes.apply(pd.api.types.is_numeric_dtype)):
+        corr = data.select_dtypes(include=[np.number]).dropna().corr()
+        if corr.shape[1]<=33:
+            mask = np.triu(np.ones_like(corr, dtype=bool))
+            # Dynamically scale fontsize based on the number of columns
+            num_columns = corr.shape[1]
+            fontsize = max(6, min(12, 12 - (num_columns - 10) * 0.2)) # Scale between 8 and 12
+
+            ax_heatmap=sns.heatmap(
+                corr,
+                mask=mask,
+                annot=True,
+                cmap="coolwarm",
+                center=0,
+                fmt=".2f",
+                linewidths=0.5,
+                vmin=-1, vmax=1,
+                ax=nexttile(2, 2),
+                cbar_kws=dict(shrink=0.2,ticks=np.arange(-1, 2, 1)),
+                annot_kws={"size": fontsize}
+            )
+
+            figsets(
+                xangle=45,
+                title="Correlation Heatmap",
+                ax=ax_heatmap
+            )

 def use_pd(
     func_name="excel",
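
Finally, a minimal sketch of how the new df_qc helper might be called; the import path, the skimpy dependency, and the keyword defaults are taken from the hunk above, everything else is assumed:

    import numpy as np
    import pandas as pd
    from py2ls.ips import df_qc  # assumed import path

    df = pd.DataFrame({
        "id": range(100),
        "group": np.random.choice(list("AB"), 100),
        "score": np.random.randn(100),
        "empty": [np.nan] * 100,  # should trigger the empty-column warning
    })

    # verbose=True would print the full QC report, plot_=True would draw the
    # diagnostic panels; output=True returns the res_qc dict instead of None.
    res_qc = df_qc(df, verbose=False, plot_=False, output=True)
    print(res_qc["warnings"])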