py2ls 0.2.4.15__py3-none-any.whl → 0.2.4.17__py3-none-any.whl

py2ls/ips.py CHANGED
@@ -2171,6 +2171,8 @@ def fload(fpath, kind=None, **kwargs):
                     continue
                 else:
                     pass
+        if is_df_abnormal(df,verbose=verbose):
+            df=pd.read_csv(fpath,**kwargs)
         display(df.head(2))
         print(f"shape: {df.shape}")
         return df
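
The new branch re-parses the file with a plain `pd.read_csv(fpath, **kwargs)` whenever the first parse looks malformed. Below is a minimal standalone sketch of that parse, sanity-check, fall-back pattern; `looks_abnormal` is a hypothetical stand-in for py2ls's `is_df_abnormal`, which applies its own heuristics.

import pandas as pd

def looks_abnormal(df: pd.DataFrame) -> bool:
    # Hypothetical check: flag frames that collapsed into a single column that
    # still contains delimiters, or that are dominated by "Unnamed:" columns.
    if df.shape[1] == 1 and df.iloc[:, 0].astype(str).str.contains("[,;\t|]").mean() > 0.5:
        return True
    unnamed = sum(str(c).startswith("Unnamed:") for c in df.columns)
    return unnamed > df.shape[1] / 2

def load_csv_with_fallback(fpath, **kwargs):
    # First attempt with a guessed separator, then retry with pandas defaults
    # if the result looks malformed.
    df = pd.read_csv(fpath, sep=None, engine="python", **kwargs)
    if looks_abnormal(df):
        df = pd.read_csv(fpath, **kwargs)
    return df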
@@ -3163,14 +3165,19 @@ def listdir(
     if kind is None:
         ls = os.listdir(rootdir)
         ls = [f for f in ls if not f.startswith(".") and not f.startswith("~")]
-        print(ls)
+        if verbose:
+            if len(ls)>20:
+                print(ls[:20])
+            else:
+                print(ls)
         df_all = pd.DataFrame(
             {
                 "fname": ls,
                 "fpath": [os.path.join(rootdir, i) for i in ls],
            }
         )
-        display(df_all)
+        if verbose:
+            display(df_all.head())
         return df_all
     if isinstance(kind, list):
         f_ = []
@@ -3206,6 +3213,7 @@ def listdir(
         "size": [],
         "fname": [],
         "fpath": [],
+        "basename":[],
     }
     for item in ls:
         item_path = os.path.join(rootdir, item)
@@ -3228,6 +3236,7 @@ def listdir(
             f["length"].append(len(filename))
             f["path"].append(os.path.join(os.path.dirname(item_path), item))
             fpath = os.path.join(os.path.dirname(item_path), item)
+            basename=os.path.basename(item_path)
             f["size"].append(round(os.path.getsize(fpath) / 1024 / 1024, 3))
             f["created_time"].append(
                 pd.to_datetime(os.path.getctime(item_path), unit="s")
@@ -3240,6 +3249,7 @@ def listdir(
             )
             f["fname"].append(filename) # will be removed
             f["fpath"].append(fpath) # will be removed
+            f['basename'].append(basename)
             i += 1
 
     f["num"] = i
@@ -3462,7 +3472,6 @@ def figsave(*args, dpi=300):
         img.save(fname, format=ftype.upper(), dpi=(dpi, dpi))
     elif isinstance(img, np.ndarray):
         import cv2
-
         # Check the shape of the image to determine color mode
         if img.ndim == 2:
             # Grayscale image
@@ -5055,16 +5064,22 @@ def _df_outlier(
     from scipy.stats import zscore
     from sklearn.ensemble import IsolationForest
     from sklearn.preprocessing import StandardScaler
-
+
+    # Fill completely NaN columns with a default value (e.g., 0)
+    data = data.copy()
+    data.loc[:, data.isna().all()] = 0
+    if columns is not None:
+        if isinstance(columns, (list,pd.core.indexes.base.Index)):
+            data=data[columns]
     col_names_org = data.columns.tolist()
     index_names_org = data.index.tolist()
     # Separate numeric and non-numeric columns
     numeric_data = data.select_dtypes(include=[np.number])
     non_numeric_data = data.select_dtypes(exclude=[np.number])
 
-    if columns is not None:
-        numeric_data = numeric_data[columns]
-    elif numeric_data.empty:
+    # if columns is not None:
+    #     numeric_data = numeric_data[columns]
+    if numeric_data.empty:
         raise ValueError("Input data must contain numeric columns.")
 
     outliers_df = pd.DataFrame(index=numeric_data.index)
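
This hunk (like the df_fillna hunk below) pre-fills columns that contain only NaN before any scaling or outlier detection: such columns propagate NaN through scaling and make estimators like IsolationForest raise. A small illustrative sketch of the guard, assuming a plain StandardScaler rather than the full _df_outlier pipeline:

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

df = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [np.nan, np.nan, np.nan]})

# Replace fully-empty columns with a constant so downstream scalers and
# outlier detectors receive finite values instead of all-NaN input.
df = df.copy()
df.loc[:, df.isna().all()] = 0

scaled = StandardScaler().fit_transform(df)
print(scaled)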
@@ -5626,6 +5641,10 @@ def df_fillna(
     for col in data.columns:
         data[col] = data[col].apply(lambda x: np.nan if x is None else x)
 
+    # Fill completely NaN columns with a default value (e.g., 0)
+    data = data.copy()
+    data.loc[:, data.isna().all()] = 0
+
     col_names_org = data.columns.tolist()
     index_names_org = data.index.tolist()
     # Separate numeric and non-numeric columns
@@ -5682,11 +5701,11 @@ def df_fillna(
         imputed_data = imputer.fit_transform(numeric_data.T)
     else:
         raise ValueError("Invalid axis. Use 0 for columns or 1 for rows.")
-
+
     imputed_data = pd.DataFrame(
         imputed_data if axis == 0 else imputed_data.T,
-        index=numeric_data.index if axis == 0 else data.columns,
-        columns=numeric_data.columns if axis == 0 else data.index,
+        index=numeric_data.index if axis == 0 else numeric_data.columns,
+        columns=numeric_data.columns if axis == 0 else numeric_data.index,
     )
     for col in imputed_data.select_dtypes(include=[np.number]).columns:
         imputed_data[col] = imputed_data[col].astype(numeric_data[col].dtype)
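
The relabeling change matters when axis=1: the imputation runs on `numeric_data.T`, so the reconstructed frame should be labeled from `numeric_data`'s own axes rather than from the full `data`, which may still contain non-numeric columns of a different length. A standalone sketch of the transpose, impute, relabel pattern, using scikit-learn's SimpleImputer purely for illustration:

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

data = pd.DataFrame(
    {"name": ["a", "b", "c"], "x": [1.0, np.nan, 3.0], "y": [4.0, 5.0, np.nan]}
)
numeric_data = data.select_dtypes(include=[np.number])

# axis=1: impute across rows by transposing, imputing, and transposing back.
imputer = SimpleImputer(strategy="mean")
imputed = imputer.fit_transform(numeric_data.T)

imputed_data = pd.DataFrame(
    imputed.T,
    # Label with numeric_data's own axes; using data.columns / data.index here
    # would mix in the non-numeric "name" column and misalign the result.
    index=numeric_data.index,
    columns=numeric_data.columns,
)
print(imputed_data)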
@@ -5826,8 +5845,13 @@ def df_encoder(
         from sklearn.preprocessing import LabelEncoder
 
         encoder = LabelEncoder()
-        encoded_data = data[columns].apply(lambda col: encoder.fit_transform(col))
-        return pd.concat([data.drop(columns, axis=1), encoded_data], axis=1)
+        # Apply LabelEncoder only to non-numeric columns
+        non_numeric_columns = [col for col in columns if not pd.api.types.is_numeric_dtype(data[col])]
+
+        if not non_numeric_columns:
+            return data
+        encoded_data = data[non_numeric_columns].apply(lambda col: encoder.fit_transform(col))
+        return pd.concat([data.drop(non_numeric_columns, axis=1), encoded_data], axis=1)
 
     # Target encoding (Mean of the target for each category)
     elif method == "target":
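
With this change, the "label" path leaves columns that are already numeric untouched and only label-encodes the rest. A self-contained sketch of the same filter outside py2ls (the wrapper name and signature below are illustrative, not the real df_encoder API):

import pandas as pd
from sklearn.preprocessing import LabelEncoder

def label_encode_non_numeric(data: pd.DataFrame, columns) -> pd.DataFrame:
    # Only non-numeric columns are label-encoded; numeric ones pass through untouched.
    encoder = LabelEncoder()
    non_numeric = [c for c in columns if not pd.api.types.is_numeric_dtype(data[c])]
    if not non_numeric:
        return data
    encoded = data[non_numeric].apply(lambda col: encoder.fit_transform(col))
    return pd.concat([data.drop(non_numeric, axis=1), encoded], axis=1)

df = pd.DataFrame({"group": ["a", "b", "a"], "score": [1.0, 2.0, 3.0]})
print(label_encode_non_numeric(df, df.columns))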
@@ -6878,7 +6902,188 @@ def df_reducer(
 # example:
 # df_reducer(data=data_log, columns=markers, n_components=2)
 
+def df_format(data, threshold_unique=0.5, verbose=False):
+    """
+    Detect whether a table is in long, wide, or uncertain format.
+
+    Parameters:
+    - data (pd.DataFrame): DataFrame to check.
+    - threshold_unique (float): Proportion threshold for detecting categorical columns.
+
+    Returns:
+    - "long" if detected as long format,
+    - "wide" if detected as wide format
+    - "uncertain" if ambiguous.
+    """
+    from scipy.stats import entropy
+    from sklearn.cluster import AgglomerativeClustering
+    from sklearn.preprocessing import StandardScaler
+
+    long_score = 0
+    wide_score = 0
+
+    n_rows, n_cols = data.shape
+
+    # Step 1: Row-Column Ratio Heuristic
+    if n_rows > 3 * n_cols:
+        long_score += 2
+        if verbose:
+            print(
+                "Row-Column Ratio suggests long format (many rows relative to columns)."
+            )
+    elif n_cols > 3 * n_rows:
+        wide_score += 2
+        if verbose:
+            print(
+                "Row-Column Ratio suggests wide format (many columns relative to rows)."
+            )
+
+    # Step 2: Unique-to-duplicate ratio and entropy for categorical variables
+    unique_counts = data.apply(lambda x: x.nunique())
+    duplicate_ratio = 1 - unique_counts / n_rows
+    if (duplicate_ratio > 0.2).sum() > 0.5 * n_cols:
+        wide_score += 2
+        if verbose:
+            print("High duplicate values in columns suggest wide format.")
+    else:
+        long_score += 1
+        if verbose:
+            print(
+                "Lower duplicate ratio suggests long format (higher row variability)."
+            )
 
+    # Calculate entropy for categorical columns
+    categorical_cols = data.select_dtypes(include=["object", "category"]).columns
+    if len(categorical_cols) > 0:
+        for col in categorical_cols:
+            counts = data[col].value_counts(normalize=True)
+            col_entropy = entropy(counts)
+            if col_entropy < 1.5:
+                long_score += 1
+                if verbose:
+                    print(
+                        f"Column '{col}' entropy suggests categorical, supporting long format."
+                    )
+            else:
+                wide_score += 1
+                if verbose:
+                    print(f"Column '{col}' entropy is higher, supporting wide format.")
+
+    # Step 3: Column grouping analysis for patterns in suffixes/prefixes
+    col_names = data.columns.astype(str)
+    suffix_count = sum("_" in col or col[-1].isdigit() for col in col_names)
+    if suffix_count > 0.3 * n_cols:
+        wide_score += 2
+        if verbose:
+            print(
+                "Detected suffix/prefix patterns in column names, suggesting wide format."
+            )
+
+    # Step 4: Entity identifier detection for long format with categorical columns
+    if len(categorical_cols) > 0 and n_rows > n_cols:
+        entity_identifier_count = sum(
+            data.duplicated(subset=categorical_cols, keep=False)
+        )
+        if entity_identifier_count > 0.2 * n_rows:
+            long_score += 2
+            if verbose:
+                print(
+                    "Significant duplicate rows based on categorical columns, suggesting long format."
+                )
+
+    # Step 5: Clustering analysis on numerical columns for correlation in wide format
+    numeric_cols = data.select_dtypes(include="number").columns
+    if len(numeric_cols) > 1:
+        scaled_data = StandardScaler().fit_transform(data[numeric_cols].dropna())
+        clustering = AgglomerativeClustering(n_clusters=2).fit(scaled_data.T)
+        cluster_labels = pd.Series(clustering.labels_)
+        if cluster_labels.nunique() < len(numeric_cols) * 0.5:
+            wide_score += 2
+            if verbose:
+                print("Clustering on columns shows grouping, suggesting wide format.")
+
+    # Step 6: Inter-column correlation analysis
+    if len(numeric_cols) > 1:
+        corr_matrix = data[numeric_cols].corr().abs()
+        avg_corr = (
+            corr_matrix.where(~np.eye(len(corr_matrix), dtype=bool)).mean().mean()
+        )
+        if avg_corr > 0.6:
+            wide_score += 2
+            if verbose:
+                print("High inter-column correlation suggests wide format.")
+
+    # Step 7: Missing value pattern analysis
+    missing_patterns = data.isna().sum(axis=1)
+    if missing_patterns.std() < 2:
+        wide_score += 1
+        if verbose:
+            print(
+                "Low variation in missing patterns across rows, supporting wide format."
+            )
+    elif missing_patterns.mean() < 1:
+        long_score += 1
+        if verbose:
+            print("Lower missing pattern suggests long format (less structured).")
+
+    # Step 8: Multi-level clustering on rows to detect block structure for wide format
+    if len(numeric_cols) > 1 and n_rows > 5:
+        clustering_rows = AgglomerativeClustering(n_clusters=2).fit(scaled_data)
+        if pd.Series(clustering_rows.labels_).nunique() < 2:
+            wide_score += 2
+            if verbose:
+                print("Row clustering reveals homogeneity, suggesting wide format.")
+
+    # Step 9: Sequential name detection for time-series pattern in wide format
+    if any(col.isdigit() or col.startswith("T") for col in col_names):
+        wide_score += 1
+        if verbose:
+            print("Detected time-like sequential column names, supporting wide format.")
+
+    # Step 10: Entropy of numeric columns
+    numeric_entropy = data[numeric_cols].apply(
+        lambda x: entropy(pd.cut(x, bins=10).value_counts(normalize=True))
+    )
+    if numeric_entropy.mean() < 2:
+        wide_score += 2
+        if verbose:
+            print(
+                "Low entropy in numeric columns indicates stability across columns, supporting wide format."
+            )
+
+    # Step 11: Tie-breaking strategy if scores are equal
+    if wide_score == long_score:
+        if n_cols > n_rows:
+            wide_score += 1
+            if verbose:
+                print(
+                    "Tie-breaking based on column-major structure, favoring wide format."
+                )
+        elif n_rows > n_cols:
+            long_score += 1
+            if verbose:
+                print(
+                    "Tie-breaking based on row-major structure, favoring long format."
+                )
+        else:
+            if verbose:
+                print("Tie-breaking inconclusive; returning 'uncertain'.")
+            return "uncertain"
+
+    # Final decision
+    if wide_score > long_score:
+        if verbose:
+            print("Final decision: Wide format.")
+        return "wide"
+    elif long_score > wide_score:
+        if verbose:
+            print("Final decision: Long format.")
+        return "long"
+    else:
+        if verbose:
+            print("Final decision: Uncertain format.")
+        return "uncertain"
+
 def plot_cluster(
     data: pd.DataFrame,
     labels: np.ndarray,
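
A hedged usage sketch of the new df_format heuristic. It assumes the function is importable from py2ls.ips at module level, as the surrounding context suggests; with verbose=True it prints which heuristics fired before returning "long", "wide", or "uncertain".

import numpy as np
import pandas as pd
from py2ls import ips

# One row per subject and many repeated measurement columns: a wide-shaped table.
rng = np.random.default_rng(0)
wide = pd.DataFrame(rng.normal(size=(8, 30)), columns=[f"t{i}" for i in range(30)])

fmt = ips.df_format(wide, verbose=True)
print(fmt)  # one of "wide", "long", or "uncertain"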
@@ -7126,7 +7331,514 @@ def evaluate_cluster(
         metrics["V-Measure"] = np.nan
 
     return metrics
+def df_qc(
+    data: pd.DataFrame,
+    columns=None,
+    verbose=False,
+    plot_=True,
+    max_cols=20, # only for plots
+    output=False,
+):
+    """
+    Usage example:
+    df = pd.DataFrame(...)  # Your DataFrame
+    res_qc = df_qc(df)
+    """
+    from statsmodels.stats.outliers_influence import variance_inflation_factor
+    from scipy.stats import skew, kurtosis, entropy
+    import skimpy
+
+    #! display(data.select_dtypes(include=[np.number]).describe())
+    #!skim
+    if columns is not None:
+        if isinstance(columns, (list,pd.core.indexes.base.Index)):
+            data=data[columns]
+    try:
+        skimpy.skim(data)
+    except:
+        numerical_data = data.select_dtypes(include=[np.number])
+        skimpy.skim(numerical_data)
+    # Fill completely NaN columns with a default value (e.g., 0)
+    data = data.copy()
+    data.loc[:, data.isna().all()] = 0
+    res_qc = {}
+
+    # Missing values
+    res_qc["missing_values"] = data.isnull().sum()
+    res_qc["missing_percentage"] = (res_qc["missing_values"] / len(data)) * 100
+    res_qc["rows_with_missing"] = data.isnull().any(axis=1).sum()
+
+    # Data types and unique values
+    res_qc["data_types"] = data.dtypes
+    res_qc["unique_values"] = data.nunique()
+    res_qc["constant_columns"] = [
+        col for col in data.columns if data[col].nunique() <= 1
+    ]
+
+    # Duplicate rows and columns
+    res_qc["duplicate_rows"] = data.duplicated().sum()
+    res_qc["duplicate_columns"] = data.columns[data.columns.duplicated()].tolist()
+
+    # Empty columns
+    res_qc["empty_columns"] = [col for col in data.columns if data[col].isnull().all()]
+
+    # outliers
+    data_outliers = df_outlier(data)
+    outlier_num = data_outliers.isna().sum() - data.isnull().sum()
+    res_qc["outlier_num"] = outlier_num[outlier_num > 0]
+    outlier_percentage=(outlier_num / len(data_outliers)) * 100
+    res_qc["outlier_percentage"] = outlier_percentage[outlier_percentage>0]
+    # Correlation and multicollinearity (VIF)
+    if any(data.dtypes.apply(pd.api.types.is_numeric_dtype)):
+        numeric_df = data.select_dtypes(include=[np.number]).dropna()
+        corr_matrix = numeric_df.corr()
+        high_corr_pairs = [
+            (col1, col2)
+            for col1 in corr_matrix.columns
+            for col2 in corr_matrix.columns
+            if col1 != col2 and abs(corr_matrix[col1][col2]) > 0.9
+        ]
+        res_qc["high_correlations"] = high_corr_pairs
+
+        # VIF for multicollinearity check
+        numeric_df = data.select_dtypes(include=[np.number]).dropna()
+        vif_data = pd.DataFrame()
+        res_qc["vif"]=vif_data
+        if numeric_df.shape[1] > 1:
+            vif_data["feature"] = numeric_df.columns
+            vif_data["VIF"] = [
+                variance_inflation_factor(numeric_df.values, i)
+                for i in range(numeric_df.shape[1])
+            ]
+            res_qc["vif"] = vif_data[
+                vif_data["VIF"] > 5
+            ] # Typically VIF > 5 indicates multicollinearity
+    # Skewness and Kurtosis
+    skewness = data.skew(numeric_only=True)
+    kurtosis_vals = data.kurt(numeric_only=True)
+    res_qc["skewness"] = skewness[abs(skewness) > 1]
+    res_qc["kurtosis"] = kurtosis_vals[abs(kurtosis_vals) > 3]
+
+    # Entropy for categorical columns (higher entropy suggests more disorder)
+    categorical_cols = data.select_dtypes(include=["object", "category"]).columns
+    res_qc["entropy_categoricals"] = {
+        col: entropy(data[col].value_counts(normalize=True), base=2)
+        for col in categorical_cols
+    }
+    # number of unique
+    res_qc["unique_counts"] = data.nunique()
+    # dtypes counts
+    res_qc['dtype_counts']=data.dtypes.value_counts()
+
+    # Distribution Analysis (mean, median, mode, std dev, IQR for numeric columns)
+    distribution_stats = data.select_dtypes(include=[np.number]).describe().T
+    iqr = data.select_dtypes(include=[np.number]).apply(
+        lambda x: x.quantile(0.75) - x.quantile(0.25)
+    )
+    distribution_stats["IQR"] = iqr
+    res_qc["distribution_analysis"] = distribution_stats
+
+    # Variance Check: Identify low-variance columns
+    variance_threshold = 0.01
+    low_variance_cols = [
+        col
+        for col in data.select_dtypes(include=[np.number]).columns
+        if data[col].var() < variance_threshold
+    ]
+    res_qc["low_variance_features"] = low_variance_cols
+
+    # Categorical columns and cardinality
+    categorical_cols = data.select_dtypes(include=["object", "category"]).columns
+    high_cardinality = {
+        col: data[col].nunique() for col in categorical_cols if data[col].nunique() > 50
+    }
+    res_qc["high_cardinality_categoricals"] = high_cardinality
+
+    # Feature-type inconsistency (mixed types in columns)
+    inconsistent_types = {}
+    for col in data.columns:
+        unique_types = set(type(val) for val in data[col].dropna())
+        if len(unique_types) > 1:
+            inconsistent_types[col] = unique_types
+    res_qc["inconsistent_types"] = inconsistent_types
+
+
+    # Text length analysis for text fields
+    text_lengths = {}
+    for col in categorical_cols:
+        text_lengths[col] = {
+            "avg_length": data[col].dropna().apply(len).mean(),
+            "length_variance": data[col].dropna().apply(len).var(),
+        }
+    res_qc["text_length_analysis"] = text_lengths
+
+    # Summary statistics
+    res_qc["summary_statistics"] = data.describe().T
+
+    # Automated warnings
+    warnings = []
+    if res_qc["duplicate_rows"] > 0:
+        warnings.append("Warning: Duplicate rows detected.")
+    if len(res_qc["empty_columns"]) > 0:
+        warnings.append("Warning: Columns with only NaN values detected.")
+    if len(res_qc["constant_columns"]) > 0:
+        warnings.append("Warning: Columns with a single constant value detected.")
+    if len(high_corr_pairs) > 0:
+        warnings.append("Warning: Highly correlated columns detected.")
+    if len(res_qc["vif"]) > 0:
+        warnings.append("Warning: Multicollinearity detected in features.")
+    if len(high_cardinality) > 0:
+        warnings.append("Warning: High cardinality in categorical columns.")
+    if len(inconsistent_types) > 0:
+        warnings.append("Warning: Columns with mixed data types detected.")
+    res_qc["warnings"] = warnings
+
+    # Report generation
+    if verbose:
+        print("=== QC Report Summary ===")
+        print("\nMissing Values (Total and %):")
+        print(res_qc["missing_values"][res_qc["missing_values"] > 0])
+        print(res_qc["missing_percentage"][res_qc["missing_percentage"] > 0])
+
+        print("\nRows with Missing Values:", res_qc["rows_with_missing"])
+
+        print("\nData Types:")
+        print(res_qc["data_types"])
+
+        print("\nUnique Values per Column:")
+        print(res_qc["unique_values"])
+
+        print("\nConstant Columns:", res_qc["constant_columns"])
+
+        print("\nDuplicate Rows:", res_qc["duplicate_rows"])
+        print("Duplicate Columns:", res_qc["duplicate_columns"])
+
+        if res_qc["empty_columns"]:
+            print("\nEmpty Columns:", res_qc["empty_columns"])
+
+        print("\nOutlier Report:")
+        print(res_qc["outlier_num"])
+        print("\nPercentage of Values Replaced per Column:")
+        print(res_qc["outlier_percentage"])
+
+        print("\nHigh Correlations (>|0.9|):")
+        for col1, col2 in res_qc["high_correlations"]:
+            print(f" {col1} and {col2}")
+
+        if "vif" in res_qc:
+            print("\nFeatures with High VIF (>|5|):")
+            print(res_qc["vif"])
+
+        print("\nHigh Cardinality Categorical Columns (>|50 unique|):")
+        print(res_qc["high_cardinality_categoricals"])
+
+        print("\nInconsistent Data Types:")
+        print(res_qc["inconsistent_types"])
+
+        print("\nRange Checks for Numeric Columns:")
+        print(res_qc["range_checks"])
+
+        print("\nText Length Analysis:")
+        for col, stats in res_qc["text_length_analysis"].items():
+            print(
+                f"{col}: Avg Length={stats['avg_length']}, Length Variance={stats['length_variance']}"
+            )
+
+        print("\nSummary Statistics:")
+        print(res_qc["summary_statistics"])
 
+        if res_qc["warnings"]:
+            print("\nWarnings:")
+            for warning in res_qc["warnings"]:
+                print(" -", warning)
+    if plot_:
+        df_qc_plots(data=data, res_qc=res_qc, max_cols=20)
+    if output:
+        return res_qc
+    return None
+
+
+def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20):
+    import matplotlib.pyplot as plt
+    import seaborn as sns
+    from .plot import subplot, figsets, get_color
+
+    if columns is not None:
+        if isinstance(columns, (list,pd.core.indexes.base.Index)):
+            data=data[columns]
+    len_total = len(res_qc)
+    n_row, n_col = int((len_total + 10) / 3), 3
+    nexttile = subplot(n_row, n_col, figsize=[5 * n_col, 5 * n_row],verbose=False)
+
+    missing_data = res_qc["missing_values"][res_qc["missing_values"] > 0].sort_values(
+        ascending=False
+    )
+    if len(missing_data) > max_cols:
+        missing_data = missing_data[:max_cols]
+    ax=sns.barplot(
+        x=missing_data.index,
+        y=missing_data.values,
+        hue=missing_data.index,
+        palette=get_color(len(missing_data), cmap="Blues")[::-1],
+        ax=nexttile(),
+    )
+    figsets(xangle=45, title="Missing (#)", ylabel="#",ax=ax)
+
+    ax2 = ax.twinx()
+    # Plot missing value percentages
+    missing_percentage = res_qc["missing_percentage"][
+        res_qc["missing_percentage"] > 0
+    ].sort_values(ascending=False)
+    sns.barplot(
+        x=missing_percentage.index,
+        y=missing_percentage.values,
+        hue=missing_percentage.index,
+        palette=get_color(len(missing_percentage), cmap="Blues")[::-1],
+        ax=ax2,#nexttile(),
+    )
+    figsets(xangle=45, ylabel="%",ax=ax2)
+    ax2.tick_params(axis="y", color='r',labelcolor='r')
+    ax2.yaxis.label.set_color('r')
+
+    outlier_num = res_qc["outlier_num"].sort_values(ascending=False)
+    if len(outlier_num) > max_cols:
+        outlier_num = outlier_num[:max_cols]
+    ax_outlier_num=sns.barplot(
+        x=outlier_num.index,
+        y=outlier_num.values,
+        hue=outlier_num.index,
+        palette=get_color(len(outlier_num), cmap="coolwarm")[::-1],
+        ax=nexttile(),
+    )
+    figsets(xangle=45, title="Outliers (#)", ylabel="#",xlabel=None)
+    ax_outlier_percentage = ax_outlier_num.twinx()
+    outlier_percentage = res_qc["outlier_percentage"].sort_values(ascending=False)
+    if len(outlier_percentage) > max_cols:
+        outlier_percentage = outlier_percentage[:max_cols]
+    ax_outlier_percentage=sns.barplot(
+        x=outlier_percentage.index,
+        y=outlier_percentage.values,
+        hue=outlier_percentage.index,
+        palette=get_color(len(outlier_percentage), cmap="coolwarm")[::-1],
+        ax=ax2 #nexttile(),
+    )
+    figsets(
+        xangle=45,
+        ylabel="%",
+        xlabel=None,
+        ylim=[0, outlier_percentage.max() + 2],
+        ax=ax_outlier_percentage
+    )
+    ax2.tick_params(axis="y", color='r',labelcolor='r')
+    ax2.yaxis.label.set_color('r')
+
+    # Skewness and Kurtosis Plots
+    skewness = res_qc["skewness"].sort_values(ascending=False)
+    kurtosis = res_qc["kurtosis"].sort_values(ascending=False)
+    if not skewness.empty:
+        ax_skewness=sns.barplot(
+            x=skewness.index,
+            y=skewness.values,
+            hue=skewness.index,
+            palette=get_color(len(skewness), cmap="coolwarm")[::-1],
+            ax=nexttile(),
+        )
+        figsets(
+            xangle=45,
+            title="Highly Skewed Numeric Columns (Skewness > 1)",
+            ylabel="Skewness",xlabel=None,ax=ax_skewness
+        )
+    if not kurtosis.empty:
+        ax_kurtosis=sns.barplot(
+            x=kurtosis.index,
+            y=kurtosis.values,
+            hue=kurtosis.index,
+            palette=get_color(len(kurtosis), cmap="coolwarm")[::-1],
+            ax=nexttile(),
+        )
+        figsets(
+            xangle=45,
+            title="Highly Kurtotic Numeric Columns (Kurtosis > 3)",
+            ylabel="Kurtosis",xlabel=None,ax=ax_kurtosis
+        )
+
+    # Entropy for Categorical Variables
+    entropy_data = pd.Series(res_qc["entropy_categoricals"]).sort_values(
+        ascending=False
+    )
+    ax_entropy_data=sns.barplot(
+        x=entropy_data.index, y=entropy_data.values,hue=entropy_data.index, palette="viridis", ax=nexttile()
+    )
+    figsets(
+        xangle=45,
+        xlabel="Categorical Columns",
+        title="Entropy of Categorical Variables",
+        ylabel="Entropy (bits)",
+        ax=ax_entropy_data
+    )
+    # Distribution Analysis: Boxplot for IQR
+    ax_iqr=sns.boxplot(
+        data=data[res_qc["distribution_analysis"].index],
+        orient="v",
+        palette="Set3",
+        ax=nexttile(),
+    )
+    figsets(
+        xangle=45,
+        title="Range for Numeric Columns",
+        ylabel="#",
+        ax=ax_iqr
+    )
+    # unique counts
+    unique_counts=res_qc["unique_counts"].sort_values(ascending=False)
+    ax_unique_counts_=sns.barplot(
+        x=unique_counts.index,
+        y=unique_counts.values,
+        hue=unique_counts.index,
+        palette=get_color(len(unique_counts)+10, cmap="Blues")[::-1],
+        ax=nexttile())
+    figsets(
+        xangle=45,
+        title="Unique Counts",
+        xlabel=None,
+        ylabel="#",
+        ax=ax_unique_counts_
+    )
+    # Binary Checking
+    ax_unique_counts=sns.barplot(x=unique_counts[unique_counts<10].index,
+        y=unique_counts[unique_counts<10].values,
+        hue=unique_counts[unique_counts<10].index,
+        palette=get_color(len(unique_counts[unique_counts<10])+10, cmap="Blues")[::-1],
+        ax=nexttile())
+    plt.axhline(y=2, color="r", linestyle="--", lw=2)
+    figsets(
+        xangle=45,
+        xlabel=None,
+        title="Binary Checking",
+        ylabel="#",
+        ax=ax_unique_counts
+    )
+
+    # dtypes counts
+    dtype_counts = res_qc['dtype_counts']
+    txt = []
+    for tp in dtype_counts.index:
+        txt.append(list(data.select_dtypes(include=tp).columns))
+
+    ax_dtype_counts = sns.barplot(
+        x=dtype_counts.index,
+        y=dtype_counts.values,
+        color="#F3C8B2",
+        ax=nexttile(),
+    )
+    max_columns_per_row = 1 # Maximum number of columns per row
+    for i, tp in enumerate(dtype_counts.index):
+        if i<=20:
+            column_names = txt[i]
+            # Split the column names into multiple lines if too long
+            column_name_str = ", ".join(column_names)
+            if len(column_name_str) > 40: # If column names are too long, split them
+                column_name_str = "\n".join(
+                    [
+                        ", ".join(column_names[j : j + max_columns_per_row])
+                        for j in range(0, len(column_names), max_columns_per_row)
+                    ]
+                )
+            # Place text annotation with line breaks and rotate the text if needed
+            ax_dtype_counts.text(
+                i,
+                dtype_counts.values[i],
+                f"{column_name_str}",
+                ha="center",
+                va="top",
+                c="k",
+                fontsize=8,
+                rotation=0,
+            )
+    figsets(
+        xlabel=None,
+        title="Dtypes",
+        ylabel="#",
+        ax=ax_dtype_counts
+    )
+
+    # High cardinality: Show top categorical columns by unique value count
+    high_cardinality = res_qc["high_cardinality_categoricals"]
+    if high_cardinality and len(high_cardinality) > max_cols:
+        high_cardinality = dict(
+            sorted(high_cardinality.items(), key=lambda x: x[1], reverse=True)[
+                :max_cols
+            ]
+        )
+
+    if high_cardinality:
+        ax_high_cardinality=sns.barplot(
+            x=list(high_cardinality.keys()),
+            y=list(high_cardinality.values()),
+            hue=list(high_cardinality.keys()),
+            palette="Oranges", ax=nexttile()
+        )
+        figsets(
+            xangle=45,
+            title="High Cardinality Categorical Columns",
+            ylabel="Unique Value Count",
+            ax=ax_high_cardinality
+        )
+    if res_qc["low_variance_features"]:
+        low_variance_data = data[res_qc["low_variance_features"]].copy()
+        for col in low_variance_data.columns:
+            sns.histplot(
+                low_variance_data[col], bins=20, kde=True, color="coral", ax=nexttile()
+            )
+            plt.title(f"Low Variance Feature: {col}")
+
+    # VIF plot for multicollinearity detection
+    if "vif" in res_qc and not res_qc["vif"].empty:
+        vif_data = res_qc["vif"].sort_values(by="VIF", ascending=False)
+        if len(vif_data) > max_cols:
+            vif_data = vif_data[:max_cols]
+        ax_vif=sns.barplot(data=vif_data,
+            x="VIF",
+            y="feature",
+            hue="VIF",
+            palette=get_color(len(vif_data)+10, cmap="Blues")[::-1],
+            ax=nexttile())
+        figsets(
+            xangle=45,
+            title="Variance Inflation Factor(VIF)",
+            xlabel="Variance Inflation Factor(VIF)",
+            ylabel="Features",
+            legend=None,
+            ax=ax_vif
+        )
+
+    # Correlation heatmap for numeric columns with high correlation pairs
+    if any(data.dtypes.apply(pd.api.types.is_numeric_dtype)):
+        corr = data.select_dtypes(include=[np.number]).dropna().corr()
+        if corr.shape[1]<=33:
+            mask = np.triu(np.ones_like(corr, dtype=bool))
+            # Dynamically scale fontsize based on the number of columns
+            num_columns = corr.shape[1]
+            fontsize = max(6, min(12, 12 - (num_columns - 10) * 0.2)) # Scale between 8 and 12
+
+            ax_heatmap=sns.heatmap(
+                corr,
+                mask=mask,
+                annot=True,
+                cmap="coolwarm",
+                center=0,
+                fmt=".2f",
+                linewidths=0.5,
+                vmin=-1, vmax=1,
+                ax=nexttile(2, 2),
+                cbar_kws=dict(shrink=0.2,ticks=np.arange(-1, 2, 1)),
+                annot_kws={"size": fontsize}
+            )
+
+            figsets(
+                xangle=45,
+                title="Correlation Heatmap",
+                ax=ax_heatmap
+            )
 
 def use_pd(
     func_name="excel",
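
A hedged usage sketch for the new df_qc entry point, assuming py2ls 0.2.4.17 with its optional QC dependencies (skimpy, statsmodels, seaborn) installed; output=True returns the res_qc dict, and plot_=True (the default) would additionally render the df_qc_plots dashboard.

import numpy as np
import pandas as pd
from py2ls import ips

rng = np.random.default_rng(1)
df = pd.DataFrame({
    "id": range(100),
    "group": rng.choice(["a", "b", "c"], size=100),
    "x": rng.normal(size=100),
    "y": rng.normal(size=100),
})
df.loc[::10, "x"] = np.nan  # sprinkle some missing values

# Skip the plots here and just collect the QC dictionary.
res_qc = ips.df_qc(df, plot_=False, output=True)
print(res_qc["missing_percentage"])
print(res_qc["warnings"])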