py2ls 0.2.4.15__py3-none-any.whl → 0.2.4.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
py2ls/ips.py CHANGED
@@ -2171,6 +2171,8 @@ def fload(fpath, kind=None, **kwargs):
                     continue
                 else:
                     pass
+        if is_df_abnormal(df, verbose=verbose):
+            df = pd.read_csv(fpath, **kwargs)
         display(df.head(2))
         print(f"shape: {df.shape}")
         return df
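Note: with this change, fload re-reads the file with pandas defaults whenever the first parse looks malformed. A minimal sketch of the pattern, assuming a checker in the spirit of py2ls's is_df_abnormal (the heuristic below is an illustrative stand-in, not the library's implementation):

    import pandas as pd

    def looks_abnormal(df: pd.DataFrame) -> bool:
        # Stand-in heuristic: a single column or an all-empty frame usually
        # means the delimiter/engine guess went wrong.
        return df.shape[1] <= 1 or df.dropna(how="all").empty

    df = pd.read_csv("data.csv", sep=";")   # first attempt with guessed options
    if looks_abnormal(df):
        df = pd.read_csv("data.csv")        # fall back to pandas defaults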
@@ -3163,14 +3165,19 @@ def listdir(
     if kind is None:
         ls = os.listdir(rootdir)
         ls = [f for f in ls if not f.startswith(".") and not f.startswith("~")]
-        print(ls)
+        if verbose:
+            if len(ls) > 20:
+                print(ls[:20])
+            else:
+                print(ls)
         df_all = pd.DataFrame(
             {
                 "fname": ls,
                 "fpath": [os.path.join(rootdir, i) for i in ls],
             }
         )
-        display(df_all)
+        if verbose:
+            display(df_all.head())
         return df_all
     if isinstance(kind, list):
         f_ = []
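Note: listdir no longer prints unconditionally; console output is now gated behind verbose and capped at 20 entries. Assuming the usual module import, usage looks like:

    from py2ls import ips

    df_files = ips.listdir("/tmp")                # returns the DataFrame silently
    df_files = ips.listdir("/tmp", verbose=True)  # also prints up to 20 names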
@@ -3206,6 +3213,7 @@ def listdir(
             "size": [],
             "fname": [],
             "fpath": [],
+            "basename": [],
         }
         for item in ls:
             item_path = os.path.join(rootdir, item)
@@ -3228,6 +3236,7 @@ def listdir(
                 f["length"].append(len(filename))
                 f["path"].append(os.path.join(os.path.dirname(item_path), item))
                 fpath = os.path.join(os.path.dirname(item_path), item)
+                basename = os.path.basename(item_path)
                 f["size"].append(round(os.path.getsize(fpath) / 1024 / 1024, 3))
                 f["created_time"].append(
                     pd.to_datetime(os.path.getctime(item_path), unit="s")
@@ -3240,6 +3249,7 @@ def listdir(
                 )
                 f["fname"].append(filename)  # will be removed
                 f["fpath"].append(fpath)  # will be removed
+                f["basename"].append(basename)
                 i += 1

         f["num"] = i
@@ -3462,7 +3472,6 @@ def figsave(*args, dpi=300):
         img.save(fname, format=ftype.upper(), dpi=(dpi, dpi))
     elif isinstance(img, np.ndarray):
         import cv2
-
         # Check the shape of the image to determine color mode
         if img.ndim == 2:
             # Grayscale image
@@ -5055,16 +5064,22 @@ def _df_outlier(
     from scipy.stats import zscore
     from sklearn.ensemble import IsolationForest
     from sklearn.preprocessing import StandardScaler
-
+
+    # Fill completely NaN columns with a default value (e.g., 0)
+    data = data.copy()
+    data.loc[:, data.isna().all()] = 0
+    if columns is not None:
+        if isinstance(columns, (list, pd.core.indexes.base.Index)):
+            data = data[columns]
     col_names_org = data.columns.tolist()
     index_names_org = data.index.tolist()
     # Separate numeric and non-numeric columns
     numeric_data = data.select_dtypes(include=[np.number])
     non_numeric_data = data.select_dtypes(exclude=[np.number])

-    if columns is not None:
-        numeric_data = numeric_data[columns]
-    elif numeric_data.empty:
+    # if columns is not None:
+    #     numeric_data = numeric_data[columns]
+    if numeric_data.empty:
         raise ValueError("Input data must contain numeric columns.")

     outliers_df = pd.DataFrame(index=numeric_data.index)
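Note: column selection now happens up front, before the numeric/non-numeric split, so a columns list is applied to the whole frame instead of being indexed into the numeric subset (which raised KeyError for non-numeric names). A hedged sketch of the new order of operations:

    import numpy as np
    import pandas as pd

    data = pd.DataFrame({"a": [1.0, 2.0, 99.0], "b": list("xyz")})
    data = data.copy()
    data.loc[:, data.isna().all()] = 0                       # neutralize all-NaN columns
    data = data[["a", "b"]]                                  # subset first ...
    numeric_data = data.select_dtypes(include=[np.number])   # ... then split dtypes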
@@ -5626,6 +5641,10 @@ def df_fillna(
     for col in data.columns:
         data[col] = data[col].apply(lambda x: np.nan if x is None else x)

+    # Fill completely NaN columns with a default value (e.g., 0)
+    data = data.copy()
+    data.loc[:, data.isna().all()] = 0
+
     col_names_org = data.columns.tolist()
     index_names_org = data.index.tolist()
     # Separate numeric and non-numeric columns
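Note: imputers such as sklearn's SimpleImputer drop columns that are entirely NaN, which would silently change the frame's shape; zero-filling them first keeps every column. A minimal sketch of the pre-step:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"a": [1.0, np.nan, 3.0], "b": [np.nan] * 3})
    df = df.copy()
    df.loc[:, df.isna().all()] = 0   # "b" becomes zeros instead of being dropped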
@@ -5682,11 +5701,11 @@ def df_fillna(
         imputed_data = imputer.fit_transform(numeric_data.T)
     else:
         raise ValueError("Invalid axis. Use 0 for columns or 1 for rows.")
-
+
     imputed_data = pd.DataFrame(
         imputed_data if axis == 0 else imputed_data.T,
-        index=numeric_data.index if axis == 0 else data.columns,
-        columns=numeric_data.columns if axis == 0 else data.index,
+        index=numeric_data.index if axis == 0 else numeric_data.columns,
+        columns=numeric_data.columns if axis == 0 else numeric_data.index,
     )
     for col in imputed_data.select_dtypes(include=[np.number]).columns:
         imputed_data[col] = imputed_data[col].astype(numeric_data[col].dtype)
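Note: for axis=1 the numeric block is imputed transposed, and the rebuilt DataFrame must take its labels from the numeric subset; using the full frame's labels breaks as soon as non-numeric columns exist. A short illustration of why:

    import numpy as np
    import pandas as pd

    data = pd.DataFrame({"a": [1.0, np.nan], "b": ["x", "y"]})   # mixed dtypes
    numeric_data = data.select_dtypes(include=[np.number])
    # data.columns is ["a", "b"], but only numeric_data.columns (["a"]) was
    # imputed, so the rebuilt frame must be labelled from numeric_data, not data.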
@@ -5826,8 +5845,13 @@ def df_encoder(
         from sklearn.preprocessing import LabelEncoder

         encoder = LabelEncoder()
-        encoded_data = data[columns].apply(lambda col: encoder.fit_transform(col))
-        return pd.concat([data.drop(columns, axis=1), encoded_data], axis=1)
+        # Apply LabelEncoder only to non-numeric columns
+        non_numeric_columns = [col for col in columns if not pd.api.types.is_numeric_dtype(data[col])]
+
+        if not non_numeric_columns:
+            return data
+        encoded_data = data[non_numeric_columns].apply(lambda col: encoder.fit_transform(col))
+        return pd.concat([data.drop(non_numeric_columns, axis=1), encoded_data], axis=1)

     # Target encoding (Mean of the target for each category)
     elif method == "target":
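Note: label encoding now skips columns that are already numeric, so re-running df_encoder over a mixed frame can no longer re-map existing integer codes. The behaviour in miniature:

    import pandas as pd
    from sklearn.preprocessing import LabelEncoder

    df = pd.DataFrame({"city": ["Berlin", "Paris", "Berlin"], "year": [2020, 2021, 2020]})
    df["city"] = LabelEncoder().fit_transform(df["city"])  # "year" stays untouched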
@@ -6878,7 +6902,188 @@ def df_reducer(
     # example:
     # df_reducer(data=data_log, columns=markers, n_components=2)

+def df_format(data, threshold_unique=0.5, verbose=False):
+    """
+    Detect whether a table is in long, wide, or uncertain format.
+
+    Parameters:
+    - data (pd.DataFrame): DataFrame to check.
+    - threshold_unique (float): Proportion threshold for detecting categorical columns.
+
+    Returns:
+    - "long" if detected as long format,
+    - "wide" if detected as wide format,
+    - "uncertain" if ambiguous.
+    """
+    from scipy.stats import entropy
+    from sklearn.cluster import AgglomerativeClustering
+    from sklearn.preprocessing import StandardScaler
+
+    long_score = 0
+    wide_score = 0
+
+    n_rows, n_cols = data.shape
+
+    # Step 1: Row-Column Ratio Heuristic
+    if n_rows > 3 * n_cols:
+        long_score += 2
+        if verbose:
+            print(
+                "Row-Column Ratio suggests long format (many rows relative to columns)."
+            )
+    elif n_cols > 3 * n_rows:
+        wide_score += 2
+        if verbose:
+            print(
+                "Row-Column Ratio suggests wide format (many columns relative to rows)."
+            )
+
+    # Step 2: Unique-to-duplicate ratio and entropy for categorical variables
+    unique_counts = data.apply(lambda x: x.nunique())
+    duplicate_ratio = 1 - unique_counts / n_rows
+    if (duplicate_ratio > 0.2).sum() > 0.5 * n_cols:
+        wide_score += 2
+        if verbose:
+            print("High duplicate values in columns suggest wide format.")
+    else:
+        long_score += 1
+        if verbose:
+            print(
+                "Lower duplicate ratio suggests long format (higher row variability)."
+            )

+    # Calculate entropy for categorical columns
+    categorical_cols = data.select_dtypes(include=["object", "category"]).columns
+    if len(categorical_cols) > 0:
+        for col in categorical_cols:
+            counts = data[col].value_counts(normalize=True)
+            col_entropy = entropy(counts)
+            if col_entropy < 1.5:
+                long_score += 1
+                if verbose:
+                    print(
+                        f"Column '{col}' entropy suggests categorical, supporting long format."
+                    )
+            else:
+                wide_score += 1
+                if verbose:
+                    print(f"Column '{col}' entropy is higher, supporting wide format.")
+
+    # Step 3: Column grouping analysis for patterns in suffixes/prefixes
+    col_names = data.columns.astype(str)
+    suffix_count = sum("_" in col or col[-1].isdigit() for col in col_names)
+    if suffix_count > 0.3 * n_cols:
+        wide_score += 2
+        if verbose:
+            print(
+                "Detected suffix/prefix patterns in column names, suggesting wide format."
+            )
+
+    # Step 4: Entity identifier detection for long format with categorical columns
+    if len(categorical_cols) > 0 and n_rows > n_cols:
+        entity_identifier_count = sum(
+            data.duplicated(subset=categorical_cols, keep=False)
+        )
+        if entity_identifier_count > 0.2 * n_rows:
+            long_score += 2
+            if verbose:
+                print(
+                    "Significant duplicate rows based on categorical columns, suggesting long format."
+                )
+
+    # Step 5: Clustering analysis on numerical columns for correlation in wide format
+    numeric_cols = data.select_dtypes(include="number").columns
+    if len(numeric_cols) > 1:
+        scaled_data = StandardScaler().fit_transform(data[numeric_cols].dropna())
+        clustering = AgglomerativeClustering(n_clusters=2).fit(scaled_data.T)
+        cluster_labels = pd.Series(clustering.labels_)
+        if cluster_labels.nunique() < len(numeric_cols) * 0.5:
+            wide_score += 2
+            if verbose:
+                print("Clustering on columns shows grouping, suggesting wide format.")
+
+    # Step 6: Inter-column correlation analysis
+    if len(numeric_cols) > 1:
+        corr_matrix = data[numeric_cols].corr().abs()
+        avg_corr = (
+            corr_matrix.where(~np.eye(len(corr_matrix), dtype=bool)).mean().mean()
+        )
+        if avg_corr > 0.6:
+            wide_score += 2
+            if verbose:
+                print("High inter-column correlation suggests wide format.")
+
+    # Step 7: Missing value pattern analysis
+    missing_patterns = data.isna().sum(axis=1)
+    if missing_patterns.std() < 2:
+        wide_score += 1
+        if verbose:
+            print(
+                "Low variation in missing patterns across rows, supporting wide format."
+            )
+    elif missing_patterns.mean() < 1:
+        long_score += 1
+        if verbose:
+            print("Lower missing pattern suggests long format (less structured).")
+
+    # Step 8: Multi-level clustering on rows to detect block structure for wide format
+    if len(numeric_cols) > 1 and n_rows > 5:
+        clustering_rows = AgglomerativeClustering(n_clusters=2).fit(scaled_data)
+        if pd.Series(clustering_rows.labels_).nunique() < 2:
+            wide_score += 2
+            if verbose:
+                print("Row clustering reveals homogeneity, suggesting wide format.")
+
+    # Step 9: Sequential name detection for time-series pattern in wide format
+    if any(col.isdigit() or col.startswith("T") for col in col_names):
+        wide_score += 1
+        if verbose:
+            print("Detected time-like sequential column names, supporting wide format.")
+
+    # Step 10: Entropy of numeric columns
+    numeric_entropy = data[numeric_cols].apply(
+        lambda x: entropy(pd.cut(x, bins=10).value_counts(normalize=True))
+    )
+    if numeric_entropy.mean() < 2:
+        wide_score += 2
+        if verbose:
+            print(
+                "Low entropy in numeric columns indicates stability across columns, supporting wide format."
+            )
+
+    # Step 11: Tie-breaking strategy if scores are equal
+    if wide_score == long_score:
+        if n_cols > n_rows:
+            wide_score += 1
+            if verbose:
+                print(
+                    "Tie-breaking based on column-major structure, favoring wide format."
+                )
+        elif n_rows > n_cols:
+            long_score += 1
+            if verbose:
+                print(
+                    "Tie-breaking based on row-major structure, favoring long format."
+                )
+        else:
+            if verbose:
+                print("Tie-breaking inconclusive; returning 'uncertain'.")
+            return "uncertain"
+
+    # Final decision
+    if wide_score > long_score:
+        if verbose:
+            print("Final decision: Wide format.")
+        return "wide"
+    elif long_score > wide_score:
+        if verbose:
+            print("Final decision: Long format.")
+        return "long"
+    else:
+        if verbose:
+            print("Final decision: Uncertain format.")
+        return "uncertain"
+
 def plot_cluster(
     data: pd.DataFrame,
     labels: np.ndarray,
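Note: df_format is new in this release. A hedged usage sketch (synthetic frame; the heuristics vote, so treat the returned label as a suggestion rather than a guarantee):

    import pandas as pd
    from py2ls import ips

    df_long = pd.DataFrame(
        {
            "subject": ["s1", "s1", "s2", "s2"],
            "condition": ["a", "b", "a", "b"],
            "value": [1.2, 3.4, 2.2, 4.1],
        }
    )
    fmt = ips.df_format(df_long, verbose=True)  # "long", "wide", or "uncertain"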
@@ -7126,7 +7331,514 @@ def evaluate_cluster(
             metrics["V-Measure"] = np.nan

     return metrics
+def df_qc(
+    data: pd.DataFrame,
+    columns=None,
+    verbose=False,
+    plot_=True,
+    max_cols=20,  # only for plots
+    output=False,
+):
+    """
+    Usage example:
+        df = pd.DataFrame(...)  # your DataFrame; then: res_qc = df_qc(df)
+    """
+    from statsmodels.stats.outliers_influence import variance_inflation_factor
+    from scipy.stats import skew, kurtosis, entropy
+    import skimpy
+
+    #! display(data.select_dtypes(include=[np.number]).describe())
+    #! skim
+    if columns is not None:
+        if isinstance(columns, (list, pd.core.indexes.base.Index)):
+            data = data[columns]
+    try:
+        skimpy.skim(data)
+    except Exception:
+        numerical_data = data.select_dtypes(include=[np.number])
+        skimpy.skim(numerical_data)
+    # Fill completely NaN columns with a default value (e.g., 0)
+    data = data.copy()
+    data.loc[:, data.isna().all()] = 0
+    res_qc = {}
+
+    # Missing values
+    res_qc["missing_values"] = data.isnull().sum()
+    res_qc["missing_percentage"] = (res_qc["missing_values"] / len(data)) * 100
+    res_qc["rows_with_missing"] = data.isnull().any(axis=1).sum()
+
+    # Data types and unique values
+    res_qc["data_types"] = data.dtypes
+    res_qc["unique_values"] = data.nunique()
+    res_qc["constant_columns"] = [
+        col for col in data.columns if data[col].nunique() <= 1
+    ]
+
+    # Duplicate rows and columns
+    res_qc["duplicate_rows"] = data.duplicated().sum()
+    res_qc["duplicate_columns"] = data.columns[data.columns.duplicated()].tolist()
+
+    # Empty columns
+    res_qc["empty_columns"] = [col for col in data.columns if data[col].isnull().all()]
+
+    # Outliers
+    data_outliers = df_outlier(data)
+    outlier_num = data_outliers.isna().sum() - data.isnull().sum()
+    res_qc["outlier_num"] = outlier_num[outlier_num > 0]
+    outlier_percentage = (outlier_num / len(data_outliers)) * 100
+    res_qc["outlier_percentage"] = outlier_percentage[outlier_percentage > 0]
+    # Correlation and multicollinearity (VIF)
+    if any(data.dtypes.apply(pd.api.types.is_numeric_dtype)):
+        numeric_df = data.select_dtypes(include=[np.number]).dropna()
+        corr_matrix = numeric_df.corr()
+        high_corr_pairs = [
+            (col1, col2)
+            for col1 in corr_matrix.columns
+            for col2 in corr_matrix.columns
+            if col1 != col2 and abs(corr_matrix[col1][col2]) > 0.9
+        ]
+        res_qc["high_correlations"] = high_corr_pairs
+
+        # VIF for multicollinearity check
+        numeric_df = data.select_dtypes(include=[np.number]).dropna()
+        vif_data = pd.DataFrame()
+        res_qc["vif"] = vif_data
+        if numeric_df.shape[1] > 1:
+            vif_data["feature"] = numeric_df.columns
+            vif_data["VIF"] = [
+                variance_inflation_factor(numeric_df.values, i)
+                for i in range(numeric_df.shape[1])
+            ]
+            res_qc["vif"] = vif_data[
+                vif_data["VIF"] > 5
+            ]  # Typically VIF > 5 indicates multicollinearity
+    # Skewness and kurtosis
+    skewness = data.skew(numeric_only=True)
+    kurtosis_vals = data.kurt(numeric_only=True)
+    res_qc["skewness"] = skewness[abs(skewness) > 1]
+    res_qc["kurtosis"] = kurtosis_vals[abs(kurtosis_vals) > 3]
+
+    # Entropy for categorical columns (higher entropy suggests more disorder)
+    categorical_cols = data.select_dtypes(include=["object", "category"]).columns
+    res_qc["entropy_categoricals"] = {
+        col: entropy(data[col].value_counts(normalize=True), base=2)
+        for col in categorical_cols
+    }
+    # Number of unique values
+    res_qc["unique_counts"] = data.nunique()
+    # Dtype counts
+    res_qc["dtype_counts"] = data.dtypes.value_counts()
+
+    # Distribution analysis (mean, median, std dev, IQR for numeric columns)
+    distribution_stats = data.select_dtypes(include=[np.number]).describe().T
+    iqr = data.select_dtypes(include=[np.number]).apply(
+        lambda x: x.quantile(0.75) - x.quantile(0.25)
+    )
+    distribution_stats["IQR"] = iqr
+    res_qc["distribution_analysis"] = distribution_stats
+
+    # Variance check: identify low-variance columns
+    variance_threshold = 0.01
+    low_variance_cols = [
+        col
+        for col in data.select_dtypes(include=[np.number]).columns
+        if data[col].var() < variance_threshold
+    ]
+    res_qc["low_variance_features"] = low_variance_cols
+
+    # Categorical columns and cardinality
+    categorical_cols = data.select_dtypes(include=["object", "category"]).columns
+    high_cardinality = {
+        col: data[col].nunique() for col in categorical_cols if data[col].nunique() > 50
+    }
+    res_qc["high_cardinality_categoricals"] = high_cardinality
+
+    # Feature-type inconsistency (mixed types in columns)
+    inconsistent_types = {}
+    for col in data.columns:
+        unique_types = set(type(val) for val in data[col].dropna())
+        if len(unique_types) > 1:
+            inconsistent_types[col] = unique_types
+    res_qc["inconsistent_types"] = inconsistent_types
+
+    # Text length analysis for text fields
+    text_lengths = {}
+    for col in categorical_cols:
+        text_lengths[col] = {
+            "avg_length": data[col].dropna().apply(len).mean(),
+            "length_variance": data[col].dropna().apply(len).var(),
+        }
+    res_qc["text_length_analysis"] = text_lengths
+
+    # Summary statistics
+    res_qc["summary_statistics"] = data.describe().T
+
+    # Automated warnings
+    warnings = []
+    if res_qc["duplicate_rows"] > 0:
+        warnings.append("Warning: Duplicate rows detected.")
+    if len(res_qc["empty_columns"]) > 0:
+        warnings.append("Warning: Columns with only NaN values detected.")
+    if len(res_qc["constant_columns"]) > 0:
+        warnings.append("Warning: Columns with a single constant value detected.")
+    if len(res_qc.get("high_correlations", [])) > 0:
+        warnings.append("Warning: Highly correlated columns detected.")
+    if "vif" in res_qc and len(res_qc["vif"]) > 0:
+        warnings.append("Warning: Multicollinearity detected in features.")
+    if len(high_cardinality) > 0:
+        warnings.append("Warning: High cardinality in categorical columns.")
+    if len(inconsistent_types) > 0:
+        warnings.append("Warning: Columns with mixed data types detected.")
+    res_qc["warnings"] = warnings
+
+    # Report generation
+    if verbose:
+        print("=== QC Report Summary ===")
+        print("\nMissing Values (Total and %):")
+        print(res_qc["missing_values"][res_qc["missing_values"] > 0])
+        print(res_qc["missing_percentage"][res_qc["missing_percentage"] > 0])
+
+        print("\nRows with Missing Values:", res_qc["rows_with_missing"])
+
+        print("\nData Types:")
+        print(res_qc["data_types"])
+
+        print("\nUnique Values per Column:")
+        print(res_qc["unique_values"])
+
+        print("\nConstant Columns:", res_qc["constant_columns"])
+
+        print("\nDuplicate Rows:", res_qc["duplicate_rows"])
+        print("Duplicate Columns:", res_qc["duplicate_columns"])
+
+        if res_qc["empty_columns"]:
+            print("\nEmpty Columns:", res_qc["empty_columns"])
+
+        print("\nOutlier Report:")
+        print(res_qc["outlier_num"])
+        print("\nPercentage of Values Replaced per Column:")
+        print(res_qc["outlier_percentage"])
+
+        print("\nHigh Correlations (>|0.9|):")
+        for col1, col2 in res_qc.get("high_correlations", []):
+            print(f"  {col1} and {col2}")
+
+        if "vif" in res_qc:
+            print("\nFeatures with High VIF (>|5|):")
+            print(res_qc["vif"])
+
+        print("\nHigh Cardinality Categorical Columns (>|50 unique|):")
+        print(res_qc["high_cardinality_categoricals"])
+
+        print("\nInconsistent Data Types:")
+        print(res_qc["inconsistent_types"])
+
+        if "range_checks" in res_qc:  # only populated when range checks are computed
+            print("\nRange Checks for Numeric Columns:", res_qc["range_checks"])
+
+        print("\nText Length Analysis:")
+        for col, stats in res_qc["text_length_analysis"].items():
+            print(
+                f"{col}: Avg Length={stats['avg_length']}, Length Variance={stats['length_variance']}"
+            )
+
+        print("\nSummary Statistics:")
+        print(res_qc["summary_statistics"])

+        if res_qc["warnings"]:
+            print("\nWarnings:")
+            for warning in res_qc["warnings"]:
+                print("  -", warning)
+    if plot_:
+        df_qc_plots(data=data, res_qc=res_qc, max_cols=20)
+    if output:
+        return res_qc
+    return None
+
+
+def df_qc_plots(data: pd.DataFrame, columns=None, res_qc: dict = None, max_cols=20):
+    import matplotlib.pyplot as plt
+    import seaborn as sns
+    from .plot import subplot, figsets, get_color
+
+    if columns is not None:
+        if isinstance(columns, (list, pd.core.indexes.base.Index)):
+            data = data[columns]
+    len_total = len(res_qc)
+    n_row, n_col = int((len_total + 10) / 3), 3
+    nexttile = subplot(n_row, n_col, figsize=[5 * n_col, 5 * n_row], verbose=False)
+
+    missing_data = res_qc["missing_values"][res_qc["missing_values"] > 0].sort_values(
+        ascending=False
+    )
+    if len(missing_data) > max_cols:
+        missing_data = missing_data[:max_cols]
+    ax = sns.barplot(
+        x=missing_data.index,
+        y=missing_data.values,
+        hue=missing_data.index,
+        palette=get_color(len(missing_data), cmap="Blues")[::-1],
+        ax=nexttile(),
+    )
+    figsets(xangle=45, title="Missing (#)", ylabel="#", ax=ax)
+
+    ax2 = ax.twinx()
+    # Plot missing value percentages
+    missing_percentage = res_qc["missing_percentage"][
+        res_qc["missing_percentage"] > 0
+    ].sort_values(ascending=False)
+    sns.barplot(
+        x=missing_percentage.index,
+        y=missing_percentage.values,
+        hue=missing_percentage.index,
+        palette=get_color(len(missing_percentage), cmap="Blues")[::-1],
+        ax=ax2,  # nexttile(),
+    )
+    figsets(xangle=45, ylabel="%", ax=ax2)
+    ax2.tick_params(axis="y", color="r", labelcolor="r")
+    ax2.yaxis.label.set_color("r")
+
+    outlier_num = res_qc["outlier_num"].sort_values(ascending=False)
+    if len(outlier_num) > max_cols:
+        outlier_num = outlier_num[:max_cols]
+    ax_outlier_num = sns.barplot(
+        x=outlier_num.index,
+        y=outlier_num.values,
+        hue=outlier_num.index,
+        palette=get_color(len(outlier_num), cmap="coolwarm")[::-1],
+        ax=nexttile(),
+    )
+    figsets(xangle=45, title="Outliers (#)", ylabel="#", xlabel=None)
+    ax_outlier_percentage = ax_outlier_num.twinx()
+    outlier_percentage = res_qc["outlier_percentage"].sort_values(ascending=False)
+    if len(outlier_percentage) > max_cols:
+        outlier_percentage = outlier_percentage[:max_cols]
+    ax_outlier_percentage = sns.barplot(
+        x=outlier_percentage.index,
+        y=outlier_percentage.values,
+        hue=outlier_percentage.index,
+        palette=get_color(len(outlier_percentage), cmap="coolwarm")[::-1],
+        ax=ax2,  # nexttile(),
+    )
+    figsets(
+        xangle=45,
+        ylabel="%",
+        xlabel=None,
+        ylim=[0, outlier_percentage.max() + 2],
+        ax=ax_outlier_percentage,
+    )
+    ax2.tick_params(axis="y", color="r", labelcolor="r")
+    ax2.yaxis.label.set_color("r")
+
+    # Skewness and kurtosis plots
+    skewness = res_qc["skewness"].sort_values(ascending=False)
+    kurtosis = res_qc["kurtosis"].sort_values(ascending=False)
+    if not skewness.empty:
+        ax_skewness = sns.barplot(
+            x=skewness.index,
+            y=skewness.values,
+            hue=skewness.index,
+            palette=get_color(len(skewness), cmap="coolwarm")[::-1],
+            ax=nexttile(),
+        )
+        figsets(
+            xangle=45,
+            title="Highly Skewed Numeric Columns (Skewness > 1)",
+            ylabel="Skewness",
+            xlabel=None,
+            ax=ax_skewness,
+        )
+    if not kurtosis.empty:
+        ax_kurtosis = sns.barplot(
+            x=kurtosis.index,
+            y=kurtosis.values,
+            hue=kurtosis.index,
+            palette=get_color(len(kurtosis), cmap="coolwarm")[::-1],
+            ax=nexttile(),
+        )
+        figsets(
+            xangle=45,
+            title="Highly Kurtotic Numeric Columns (Kurtosis > 3)",
+            ylabel="Kurtosis",
+            xlabel=None,
+            ax=ax_kurtosis,
+        )
+
+    # Entropy for categorical variables
+    entropy_data = pd.Series(res_qc["entropy_categoricals"]).sort_values(
+        ascending=False
+    )
+    ax_entropy_data = sns.barplot(
+        x=entropy_data.index, y=entropy_data.values, hue=entropy_data.index, palette="viridis", ax=nexttile()
+    )
+    figsets(
+        xangle=45,
+        xlabel="Categorical Columns",
+        title="Entropy of Categorical Variables",
+        ylabel="Entropy (bits)",
+        ax=ax_entropy_data,
+    )
+    # Distribution analysis: boxplot for IQR
+    ax_iqr = sns.boxplot(
+        data=data[res_qc["distribution_analysis"].index],
+        orient="v",
+        palette="Set3",
+        ax=nexttile(),
+    )
+    figsets(
+        xangle=45,
+        title="Range for Numeric Columns",
+        ylabel="#",
+        ax=ax_iqr,
+    )
+    # Unique counts
+    unique_counts = res_qc["unique_counts"].sort_values(ascending=False)
+    ax_unique_counts_ = sns.barplot(
+        x=unique_counts.index,
+        y=unique_counts.values,
+        hue=unique_counts.index,
+        palette=get_color(len(unique_counts) + 10, cmap="Blues")[::-1],
+        ax=nexttile(),
+    )
+    figsets(
+        xangle=45,
+        title="Unique Counts",
+        xlabel=None,
+        ylabel="#",
+        ax=ax_unique_counts_,
+    )
+    # Binary checking
+    ax_unique_counts = sns.barplot(
+        x=unique_counts[unique_counts < 10].index,
+        y=unique_counts[unique_counts < 10].values,
+        hue=unique_counts[unique_counts < 10].index,
+        palette=get_color(len(unique_counts[unique_counts < 10]) + 10, cmap="Blues")[::-1],
+        ax=nexttile(),
+    )
+    plt.axhline(y=2, color="r", linestyle="--", lw=2)
+    figsets(
+        xangle=45,
+        xlabel=None,
+        title="Binary Checking",
+        ylabel="#",
+        ax=ax_unique_counts,
+    )
+
+    # Dtype counts
+    dtype_counts = res_qc["dtype_counts"]
+    txt = []
+    for tp in dtype_counts.index:
+        txt.append(list(data.select_dtypes(include=tp).columns))
+
+    ax_dtype_counts = sns.barplot(
+        x=dtype_counts.index,
+        y=dtype_counts.values,
+        color="#F3C8B2",
+        ax=nexttile(),
+    )
+    max_columns_per_row = 1  # maximum number of column names per annotation line
+    for i, tp in enumerate(dtype_counts.index):
+        if i <= 20:
+            column_names = txt[i]
+            # Split the column names into multiple lines if too long
+            column_name_str = ", ".join(column_names)
+            if len(column_name_str) > 40:  # if column names are too long, split them
+                column_name_str = "\n".join(
+                    [
+                        ", ".join(column_names[j : j + max_columns_per_row])
+                        for j in range(0, len(column_names), max_columns_per_row)
+                    ]
+                )
+            # Place text annotation with line breaks and rotate the text if needed
+            ax_dtype_counts.text(
+                i,
+                dtype_counts.values[i],
+                f"{column_name_str}",
+                ha="center",
+                va="top",
+                c="k",
+                fontsize=8,
+                rotation=0,
+            )
+    figsets(
+        xlabel=None,
+        title="Dtypes",
+        ylabel="#",
+        ax=ax_dtype_counts,
+    )
+
+    # High cardinality: show top categorical columns by unique value count
+    high_cardinality = res_qc["high_cardinality_categoricals"]
+    if high_cardinality and len(high_cardinality) > max_cols:
+        high_cardinality = dict(
+            sorted(high_cardinality.items(), key=lambda x: x[1], reverse=True)[
+                :max_cols
+            ]
+        )
+
+    if high_cardinality:
+        ax_high_cardinality = sns.barplot(
+            x=list(high_cardinality.keys()),
+            y=list(high_cardinality.values()),
+            hue=list(high_cardinality.keys()),
+            palette="Oranges", ax=nexttile()
+        )
+        figsets(
+            xangle=45,
+            title="High Cardinality Categorical Columns",
+            ylabel="Unique Value Count",
+            ax=ax_high_cardinality,
+        )
+    if res_qc["low_variance_features"]:
+        low_variance_data = data[res_qc["low_variance_features"]].copy()
+        for col in low_variance_data.columns:
+            sns.histplot(
+                low_variance_data[col], bins=20, kde=True, color="coral", ax=nexttile()
+            )
+            plt.title(f"Low Variance Feature: {col}")
+
+    # VIF plot for multicollinearity detection
+    if "vif" in res_qc and not res_qc["vif"].empty:
+        vif_data = res_qc["vif"].sort_values(by="VIF", ascending=False)
+        if len(vif_data) > max_cols:
+            vif_data = vif_data[:max_cols]
+        ax_vif = sns.barplot(
+            data=vif_data,
+            x="VIF",
+            y="feature",
+            hue="VIF",
+            palette=get_color(len(vif_data) + 10, cmap="Blues")[::-1],
+            ax=nexttile(),
+        )
+        figsets(
+            xangle=45,
+            title="Variance Inflation Factor (VIF)",
+            xlabel="Variance Inflation Factor (VIF)",
+            ylabel="Features",
+            legend=None,
+            ax=ax_vif,
+        )
+
+    # Correlation heatmap for numeric columns with high correlation pairs
+    if any(data.dtypes.apply(pd.api.types.is_numeric_dtype)):
+        corr = data.select_dtypes(include=[np.number]).dropna().corr()
+        if corr.shape[1] <= 33:
+            mask = np.triu(np.ones_like(corr, dtype=bool))
+            # Dynamically scale fontsize based on the number of columns
+            num_columns = corr.shape[1]
+            fontsize = max(6, min(12, 12 - (num_columns - 10) * 0.2))  # scale between 6 and 12
+
+            ax_heatmap = sns.heatmap(
+                corr,
+                mask=mask,
+                annot=True,
+                cmap="coolwarm",
+                center=0,
+                fmt=".2f",
+                linewidths=0.5,
+                vmin=-1,
+                vmax=1,
+                ax=nexttile(2, 2),
+                cbar_kws=dict(shrink=0.2, ticks=np.arange(-1, 2, 1)),
+                annot_kws={"size": fontsize},
+            )
+
+            figsets(
+                xangle=45,
+                title="Correlation Heatmap",
+                ax=ax_heatmap,
+            )

 def use_pd(
     func_name="excel",
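Note: df_qc and df_qc_plots are the headline additions of this release. A minimal, hedged usage sketch (synthetic data; skimpy, statsmodels, seaborn, and matplotlib must be installed for the full report and plots):

    import numpy as np
    import pandas as pd
    from py2ls import ips

    rng = np.random.default_rng(0)
    df = pd.DataFrame(
        {
            "a": rng.normal(size=100),
            "b": rng.normal(size=100) * 10,
            "cat": rng.choice(list("xyz"), size=100),
        }
    )
    res = ips.df_qc(df, verbose=True, plot_=False, output=True)  # returns the QC dict
    print(res["warnings"])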