py2ls 0.2.4.25__py3-none-any.whl → 0.2.4.26__py3-none-any.whl
- py2ls/.DS_Store +0 -0
- py2ls/.git/index +0 -0
- py2ls/corr.py +475 -0
- py2ls/data/.DS_Store +0 -0
- py2ls/data/hyper_param_autogluon_zeroshot2024.json +2383 -0
- py2ls/data/styles/.DS_Store +0 -0
- py2ls/data/styles/example/.DS_Store +0 -0
- py2ls/data/usages_sns.json +6 -1
- py2ls/ips.py +399 -91
- py2ls/ml2ls.py +758 -186
- py2ls/netfinder.py +16 -20
- py2ls/plot.py +916 -141
- {py2ls-0.2.4.25.dist-info → py2ls-0.2.4.26.dist-info}/METADATA +5 -1
- {py2ls-0.2.4.25.dist-info → py2ls-0.2.4.26.dist-info}/RECORD +15 -13
- py2ls/data/usages_pd copy.json +0 -1105
- {py2ls-0.2.4.25.dist-info → py2ls-0.2.4.26.dist-info}/WHEEL +0 -0
py2ls/ips.py
CHANGED
@@ -16,6 +16,7 @@ import warnings
 
 warnings.simplefilter("ignore", category=pd.errors.SettingWithCopyWarning)
 warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning)
+warnings.filterwarnings("ignore")
 
 
 def run_once_within(duration=60,reverse=False): # default 60s
@@ -541,8 +542,7 @@ def is_text(s):
 
 from typing import Any, Union
 
-
-def shared(*args, strict=True, n_shared=2, verbose=True):
+def share(*args, strict=True, n_shared=2, verbose=True):
     """
     check the shared elelements in two list.
     usage:
@@ -587,12 +587,68 @@ def shared(*args, strict=True, n_shared=2, verbose=True):
         elements2show = (
             shared_elements if len(shared_elements) < 10 else shared_elements[:5]
         )
+        tail = '' if len(shared_elements) < 10 else '......'
+        elements2show.append(tail)
         print(f"{' '*2}{len(shared_elements)} elements shared: {' '*2}{elements2show}")
         print("********* checking shared elements *********")
     return shared_elements
 
+def shared(*args, n_shared=None, verbose=True,**kwargs):
+    """
+    check the shared elelements in two list.
+    usage:
+        list1 = [1, 2, 3, 4, 5]
+        list2 = [4, 5, 6, 7, 8]
+        list3 = [5, 6, 9, 10]
+        a = shared(list1, list2,list3)
+    """
+    if verbose:
+        print("\n********* checking shared elements *********")
+
+    if len(args) == 1 and isinstance(args[0], list):
+        lists = args[0]  # Unpack the single list
+    else:
+        lists = args  # Use the provided arguments as lists
+    flattened_lists = [flatten(lst, verbose=verbose) for lst in lists]
+
+    if n_shared is None:
+        n_shared = len(flattened_lists)
+        strict = True
+    else:
+        strict = False
+    # Ensure all arguments are lists
+    if any(not isinstance(lst, list) for lst in flattened_lists):
+        print(f"{' ' * 2}All inputs must be lists.")
+        return []
+    first_list = flattened_lists[0]
+    shared_elements = [
+        item for item in first_list if all(item in lst for lst in flattened_lists)
+    ]
+    if strict:
+        # Strict mode: require elements to be in all lists
+        shared_elements = set(flattened_lists[0])
+        for lst in flattened_lists[1:]:
+            shared_elements.intersection_update(lst)
+    else:
+        from collections import Counter
 
-
+        all_elements = [item for sublist in flattened_lists for item in sublist]
+        element_count = Counter(all_elements)
+        # Get elements that appear in at least n_shared lists
+        shared_elements = [
+            item for item, count in element_count.items() if count >= n_shared
+        ]
+
+    shared_elements = flatten(shared_elements, verbose=verbose)
+    if verbose:
+        elements2show = (
+            shared_elements if len(shared_elements) < 10 else shared_elements[:5]
+        )
+        print(f"{' '*2}{len(shared_elements)} elements shared: {' '*2}{elements2show}")
+        print("********* checking shared elements *********")
+    return shared_elements
+
+def share_not(*args, n_shared=None, verbose=False):
     """
     To find the elements in list1 that are not shared with list2 while maintaining the original order of list1
     usage:
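Net effect of this hunk: the strict checker keeps its old behavior under the new name share(), while shared() is reimplemented around an n_shared threshold; by default an element must occur in every input list, and n_shared=k relaxes that to "appears in at least k of the lists". A quick sketch of the intended behavior, assuming py2ls 0.2.4.26 is installed (outputs are inferred from the Counter logic above, so treat them as illustrative):

import py2ls
from py2ls.ips import shared

list1 = [1, 2, 3, 4, 5]
list2 = [4, 5, 6, 7, 8]
list3 = [5, 6, 9, 10]

# Default (strict): the element must occur in all three lists -> only 5
print(shared(list1, list2, list3, verbose=False))

# Relaxed: the element must occur in at least 2 of the 3 lists -> 4, 5, 6
print(shared(list1, list2, list3, n_shared=2, verbose=False))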
@@ -600,7 +656,19 @@ def not_shared(*args, strict=True, n_shared=2, verbose=False):
         list2 = [4, 5, 6, 7, 8]
         not_shared(list1,list2)# output [1,3]
     """
-    _common = shared(*args,
+    _common = shared(*args, n_shared=n_shared, verbose=verbose)
+    list1 = flatten(args[0], verbose=verbose)
+    _not_shared = [item for item in list1 if item not in _common]
+    return _not_shared
+def not_shared(*args, n_shared=None, verbose=False):
+    """
+    To find the elements in list1 that are not shared with list2 while maintaining the original order of list1
+    usage:
+        list1 = [1, 8, 3, 3, 4, 5]
+        list2 = [4, 5, 6, 7, 8]
+        not_shared(list1,list2)# output [1,3]
+    """
+    _common = shared(*args, n_shared=n_shared, verbose=verbose)
     list1 = flatten(args[0], verbose=verbose)
     _not_shared = [item for item in list1 if item not in _common]
     return _not_shared
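share_not() and the re-implemented not_shared() are the order-preserving complement of shared(): they keep the elements of the first list that are absent from the shared set. A usage sketch under the same assumption that py2ls 0.2.4.26 is installed:

from py2ls.ips import not_shared

list1 = [1, 8, 3, 3, 4, 5]
list2 = [4, 5, 6, 7, 8]
print(not_shared(list1, list2))  # per the docstring, this yields [1, 3]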
@@ -1983,7 +2051,6 @@ def fload(fpath, kind=None, **kwargs):
 
     def load_csv(fpath, **kwargs):
         from pandas.errors import EmptyDataError
-
         engine = kwargs.pop("engine", "pyarrow")# default: None
         sep = kwargs.pop("sep", None)# default: ','
         index_col = kwargs.pop("index_col", None)# default: None
@@ -1994,13 +2061,20 @@ def fload(fpath, kind=None, **kwargs):
         comment = kwargs.pop("comment", None)# default: None
         fmt = kwargs.pop("fmt", False)# default:
         chunksize = kwargs.pop("chunksize", None)# default: None
+
+        #check filesize
+        f_size=round(os.path.getsize(fpath) / 1024 / 1024, 3)
+        if f_size>=50: #50 MB
+            if chunksize is None:
+                chunksize = 5000
+                print(f"file size is {f_size}MB, then set the chunksize with {chunksize}")
         engine = "c" if chunksize else engine # when chunksize, recommend 'c'
         low_memory = kwargs.pop("low_memory", True)# default: True
         low_memory = (
             False if chunksize else True
         ) # when chunksize, recommend low_memory=False # default:
         verbose = kwargs.pop("verbose", False)
-        if run_once_within():
+        if run_once_within(reverse=True):
             use_pd("read_csv", verbose=verbose)
 
         if comment is None:# default: None
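The new block sizes the file before reading: anything at or above 50 MB gets chunksize=5000, which in turn switches the parser to the "c" engine with low_memory=False. The same heuristic can be reproduced with plain pandas; a minimal sketch, with the path as a placeholder:

import os
import pandas as pd

fpath = "data/big_table.csv"  # placeholder path
f_size = round(os.path.getsize(fpath) / 1024 / 1024, 3)  # file size in MB
chunksize = 5000 if f_size >= 50 else None

if chunksize:
    # Stream the file in 5000-row chunks instead of loading it in one shot
    chunks = pd.read_csv(fpath, engine="c", low_memory=False, chunksize=chunksize)
    df = pd.concat(chunks, ignore_index=True)
else:
    df = pd.read_csv(fpath)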
@@ -2176,7 +2250,7 @@ def fload(fpath, kind=None, **kwargs):
     def load_excel(fpath, **kwargs):
         engine = kwargs.get("engine", "openpyxl")
         verbose = kwargs.pop("verbose", False)
-        if run_once_within():
+        if run_once_within(reverse=True):
             use_pd("read_excel", verbose=verbose)
         df = pd.read_excel(fpath, engine=engine, **kwargs)
         try:
@@ -2206,7 +2280,7 @@ def fload(fpath, kind=None, **kwargs):
         engine = kwargs.get("engine", "pyarrow")
         verbose = kwargs.pop("verbose", False)
 
-        if run_once_within():
+        if run_once_within(reverse=True):
             use_pd("read_parquet", verbose=verbose)
         try:
             df = pd.read_parquet(fpath, engine=engine, **kwargs)
@@ -2383,13 +2457,13 @@ def fload(fpath, kind=None, **kwargs):
         return load_xml(fpath)
     elif kind in ["csv", "tsv"]:
         # verbose = kwargs.pop("verbose", False)
-        if run_once_within():
+        if run_once_within(reverse=True):
             use_pd("read_csv")
         content = load_csv(fpath, **kwargs)
         return content
     elif kind == "pkl":
         verbose = kwargs.pop("verbose", False)
-        if run_once_within():
+        if run_once_within(reverse=True):
             use_pd("read_pickle")
         return pd.read_pickle(fpath, **kwargs)
     elif kind in ["ods", "ods", "odt"]:
@@ -2420,12 +2494,12 @@ def fload(fpath, kind=None, **kwargs):
         return load_ipynb(fpath, **kwargs)
     elif kind in ["parquet", "snappy"]:
         verbose = kwargs.pop("verbose", False)
-        if run_once_within():
+        if run_once_within(reverse=True):
             use_pd("read_parquet")
         return load_parquet(fpath, **kwargs)
     elif kind == "feather":
         verbose = kwargs.pop("verbose", False)
-        if run_once_within():
+        if run_once_within(reverse=True):
             use_pd("read_feather")
         content = pd.read_feather(fpath, **kwargs)
         return content
@@ -2684,7 +2758,7 @@ def fsave(
         # https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html
 
         verbose = kwargs.pop("verbose", False)
-        if run_once_within():
+        if run_once_within(reverse=True):
             use_pd("to_csv", verbose=verbose)
         kwargs_csv = dict(
             path_or_buf=None,
@@ -2716,7 +2790,7 @@ def fsave(
     def save_xlsx(fpath, data, **kwargs):
         verbose = kwargs.pop("verbose", False)
         sheet_name = kwargs.pop("sheet_name", "Sheet1")
-        if run_once_within():
+        if run_once_within(reverse=True):
             use_pd("to_excel", verbose=verbose)
         if any(kwargs):
             format_excel(df=data, filename=fpath, **kwargs)
@@ -5911,6 +5985,9 @@ def df_scaler(
     scaler=None,
     method="standard",
     columns=None, # default, select all numeric col/row
+    feature_range=None,# specific for 'minmax'
+    vmin=0,
+    vmax=1,
     inplace=False,
     verbose=False, # show usage
     axis=0, # defalut column-wise
@@ -5943,11 +6020,13 @@ def df_scaler(
         scaler = StandardScaler(**kwargs)
     elif method == "minmax":
         from sklearn.preprocessing import MinMaxScaler
+        if feature_range is None:
+            feature_range=(vmin,vmax)
         if verbose:
             print("don't forget to define the range: e.g., 'feature_range=(0, 1)'. ")
             print("scales the features to the range [0, 1]. Adjust feature_range if you want a different range, like [-1, 1].")
             print("Use when the data does not follow a normal distribution and you need all features in a specific range (e.g., [0, 1]).\nIdeal for algorithms that do not assume a particular distribution, such as k-nearest neighbors and neural networks.")
-        scaler = MinMaxScaler(
+        scaler = MinMaxScaler(feature_range=feature_range,**kwargs)
     elif method == "robust":
         from sklearn.preprocessing import RobustScaler
         if verbose:
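df_scaler's minmax branch now accepts the target range either as feature_range=(min, max) or via the new vmin/vmax parameters, which are folded into feature_range before the scaler is built. The equivalent direct scikit-learn call, as a sketch:

import pandas as pd
from sklearn.preprocessing import MinMaxScaler

df = pd.DataFrame({"a": [1.0, 5.0, 10.0], "b": [2.0, 4.0, 8.0]})

vmin, vmax = -1, 1  # df_scaler defaults to vmin=0, vmax=1
scaler = MinMaxScaler(feature_range=(vmin, vmax))
scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
print(scaled)  # every column now spans [-1, 1]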
@@ -6035,15 +6114,20 @@ def df_special_characters_cleaner(
 
     # 1. Clean column names by replacing special characters with underscores
     if "column" in where_:
-
+        try:
+            data.columns = data.columns.str.replace(r"[^\w\s]", "_", regex=True)
+        except Exception as e:
+            print(e)
 
     # 2. Clean only object-type columns (text columns)
-
-
-
-
-
-
+    try:
+        if "content" in where_:
+            for col in data.select_dtypes(include=["object"]).columns:
+                data[col] = data[col].str.replace(r"[^\w\s]", "", regex=True)
+        if data.index.dtype == "object" and index in where_:
+            data.index = data.index.str.replace(r"[^\w\s]", "_", regex=True)
+    except:
+        pass
     return data
 
 
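Both branches lean on the same vectorized regex replace: [^\w\s] matches everything except word characters and whitespace, so column names get those characters swapped for underscores while cell text has them stripped. A standalone sketch of the idea:

import pandas as pd

df = pd.DataFrame({"na+me!": ["a&b", "c(d)"], "val#ue": [1, 2]})

# Column names: replace special characters with underscores
df.columns = df.columns.str.replace(r"[^\w\s]", "_", regex=True)

# Object (text) columns: drop special characters entirely
for col in df.select_dtypes(include=["object"]).columns:
    df[col] = df[col].str.replace(r"[^\w\s]", "", regex=True)

print(df.columns.tolist())    # ['na_me_', 'val_ue']
print(df["na_me_"].tolist())  # ['ab', 'cd']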
@@ -6426,6 +6510,9 @@ def df_reducer(
         # "autoencoder","nmf",
     ]
     method = strcmp(method, methods)[0]
+    if run_once_within(reverse=True):
+        print(f"support methods:{methods}")
+
     if verbose:
         print(f"\nprocessing with using {dict_methods[method]}:")
     xlabel, ylabel = None, None
@@ -6433,16 +6520,20 @@ def df_reducer(
         columns = data.select_dtypes(include="number").columns.tolist()
     if hue is None:
         hue = data.select_dtypes(exclude="number").columns.tolist()
+        print(f"auto select the non-number as 'hue':{hue}")
     if isinstance(hue, list):
         print("Warning: hue is a list, only select the 1st one")
         hue = hue[0]
-    if not hue:
+    if not any(hue):
         # Select columns if specified, else use all columns
         X = data[columns].values if columns else data.values
     else:
         # Select columns to reduce and hue for LDA
-
-
+        try:
+            X = data[columns].values if columns else data.drop(columns=[hue]).values
+            y = data[hue].values
+        except:
+            pass
     print(X.shape)
     # Handle missing values
     if fill_missing:
@@ -6909,33 +7000,49 @@ def df_reducer(
         colname_met = "SVD_"
     # Quick plots
     if plot_ and (not method in ["isolation_forest"]):
-        from .plot import plotxy
-        if ax is None:
-
-
-
-
-        else:
-
+        from .plot import plotxy,figsets,get_color
+        # if ax is None:
+        #     if figsize is None:
+        #         _, ax = plt.subplots(figsize=cm2inch(8, 8))
+        #     else:
+        #         _, ax = plt.subplots(figsize=figsize)
+        # else:
+        #     ax = ax.cla()
         xlabel = f"{colname_met}1" if xlabel is None else xlabel
         ylabel = f"{colname_met}2" if ylabel is None else ylabel
+        palette=get_color(len(flatten(data[hue],verbose=0)))
+
+        reduced_df=reduced_df.sort_values(by=hue)
+        print(flatten(reduced_df[hue]))
         ax = plotxy(
             data=reduced_df,
             x=colname_met + "1",
             y=colname_met + "2",
             hue=hue,
-
+            palette=palette,
+            # size=size,
             edgecolor=edgecolor,
-            kind_="
-
-
-
-
-
-
+            kind_=["joint",
+                   # "kde",
+                   "ell",
+                   ],
+            kws_kde=dict(
+                hue=hue,
+                levels=2,
+                common_norm=False,
+                fill=True,
+                alpha=0.05,
+            ),
+            kws_joint=dict(kind='scatter',joint_kws=dict(s=size)),
+            kws_ellipse=dict(alpha=0.1,lw=1,label=None),
             verbose=False,
             **kwargs,
         )
+        figsets(
+            legend=dict(loc=legend_loc, markerscale=markerscale,bbox_to_anchor=bbox_to_anchor,ncols=ncols,fontsize=8),
+            xlabel=xlabel if xlabel else None,
+            ylabel=ylabel if ylabel else None,
+        )
 
     if inplace:
         # If inplace=True, add components back into the original data
@@ -7412,6 +7519,7 @@ def df_qc(
     from statsmodels.stats.outliers_influence import variance_inflation_factor
     from scipy.stats import skew, kurtosis, entropy
 
+    pd.options.display.max_seq_items = 10
     #! display(data.select_dtypes(include=[np.number]).describe())
     #!skim
     if columns is not None:
@@ -7428,16 +7536,18 @@ def df_qc(
         data = data.copy()
         data.loc[:, data.isna().all()] = 0
     res_qc = {}
-    print(f"data.shape:{data.shape}")
+    print(f"⤵ data.shape:{data.shape}\n⤵ data.sample(10):")
+    display(data.sample(10).style.background_gradient(cmap="coolwarm", axis=1))
 
     # Missing values
     res_qc["missing_values"] = data.isnull().sum()
-    res_qc["missing_percentage"] = (res_qc["missing_values"] / len(data)) * 100
+    res_qc["missing_percentage"] = round((res_qc["missing_values"] / len(data)) * 100,2)
     res_qc["rows_with_missing"] = data.isnull().any(axis=1).sum()
 
     # Data types and unique values
     res_qc["data_types"] = data.dtypes
-    res_qc["
+    res_qc["unique_counts"] = data.select_dtypes(exclude=np.number).nunique().sort_values()
+    res_qc["unique_values"] = data.select_dtypes(exclude=np.number).apply(lambda x: x.unique())
     res_qc["constant_columns"] = [
         col for col in data.columns if data[col].nunique() <= 1
     ]
@@ -7453,33 +7563,42 @@ def df_qc(
     data_outliers = df_outlier(data)
     outlier_num = data_outliers.isna().sum() - data.isnull().sum()
     res_qc["outlier_num"] = outlier_num[outlier_num > 0]
-    outlier_percentage=(outlier_num / len(data_outliers)) * 100
+    outlier_percentage=round((outlier_num / len(data_outliers)) * 100,2)
     res_qc["outlier_percentage"] = outlier_percentage[outlier_percentage>0]
-
-
-
-
-
-
-
-
-
-
-    res_qc["high_correlations"] = high_corr_pairs
-
-    # VIF for multicollinearity check
-    numeric_df = data.select_dtypes(include=[np.number]).dropna()
-    vif_data = pd.DataFrame()
-    res_qc["vif"]=vif_data
-    if numeric_df.shape[1] > 1 and not numeric_df.empty:
-        vif_data["feature"] = numeric_df.columns
-        vif_data["VIF"] = [
-            variance_inflation_factor(numeric_df.values, i)
-            for i in range(numeric_df.shape[1])
+    try:
+        # Correlation and multicollinearity (VIF)
+        if any(data.dtypes.apply(pd.api.types.is_numeric_dtype)):
+            numeric_df = data.select_dtypes(include=[np.number]).dropna()
+            corr_matrix = numeric_df.corr()
+            high_corr_pairs = [
+                (col1, col2)
+                for col1 in corr_matrix.columns
+                for col2 in corr_matrix.columns
+                if col1 != col2 and abs(corr_matrix[col1][col2]) > 0.9
             ]
-        res_qc["
-
-
+            res_qc["high_correlations"] = high_corr_pairs
+
+            # VIF for multicollinearity check
+            numeric_df = data.select_dtypes(include=[np.number]).dropna()
+            if isinstance(numeric_df.columns, pd.MultiIndex):
+                numeric_df.columns = [
+                    "_".join(col).strip() if isinstance(col, tuple) else col for col in numeric_df.columns
+                ]
+
+
+            vif_data = pd.DataFrame()
+            res_qc["vif"]=vif_data
+            if numeric_df.shape[1] > 1 and not numeric_df.empty:
+                vif_data["feature"] = numeric_df.columns.tolist()
+                vif_data["VIF"] = [
+                    round(variance_inflation_factor(numeric_df.values, i),2)
+                    for i in range(numeric_df.shape[1])
+                ]
+                res_qc["vif"] = vif_data[
+                    vif_data["VIF"] > 5
+                ] # Typically VIF > 5 indicates multicollinearity
+    except Exception as e:
+        print(e)
     # Skewness and Kurtosis
     skewness = data.skew(numeric_only=True)
     kurtosis_vals = data.kurt(numeric_only=True)
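The correlation/VIF section is now wrapped in try/except, flattens MultiIndex columns first, and keeps only features whose VIF exceeds 5. The underlying computation is the standard statsmodels one; a minimal standalone sketch:

import numpy as np
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor

rng = np.random.default_rng(0)
x1 = rng.normal(size=100)
df = pd.DataFrame({
    "x1": x1,
    "x2": x1 * 2 + rng.normal(scale=0.1, size=100),  # nearly collinear with x1
    "x3": rng.normal(size=100),
})

vif = pd.DataFrame({
    "feature": df.columns.tolist(),
    "VIF": [round(variance_inflation_factor(df.values, i), 2)
            for i in range(df.shape[1])],
})
print(vif[vif["VIF"] > 5])  # x1 and x2 should both far exceed 5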
@@ -7492,8 +7611,7 @@ def df_qc(
             col: entropy(data[col].value_counts(normalize=True), base=2)
             for col in categorical_cols
         }
-
-    res_qc["unique_counts"] = data.nunique()
+
     # dtypes counts
     res_qc['dtype_counts']=data.dtypes.value_counts()
 
@@ -7540,7 +7658,7 @@ def df_qc(
     res_qc["text_length_analysis"] = text_lengths
 
     # Summary statistics
-    res_qc["summary_statistics"] = data.describe().T
+    res_qc["summary_statistics"] = data.describe().T.style.background_gradient(cmap='coolwarm', axis=0)
 
     # Automated warnings
     warnings = []
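The summary table is now stored as a pandas Styler rather than a plain DataFrame, so notebooks render a per-column heat map for free; the trade-off is that downstream code must unwrap .data to get the numbers back. A minimal sketch of the pattern:

import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.default_rng(1).normal(size=(20, 3)), columns=list("abc"))

styled = df.describe().T.style.background_gradient(cmap="coolwarm", axis=0)
# In a notebook, `styled` renders as a colored table; the raw frame is still there:
print(styled.data.shape)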
@@ -7562,28 +7680,45 @@ def df_qc(
 
     # Report generation
     if verbose:
-        print("=== QC Report Summary ===")
         print("\n⤵ Summary Statistics:")
         display(res_qc["summary_statistics"])
         print("\n⤵ Data Types:")
         display(res_qc["data_types"])
         if any(res_qc["missing_values"][res_qc["missing_values"] > 0]):
             print(" ⤵ Missing Values Counts:")
-            display(
+            display(pd.DataFrame(
+                {
+                    "missing_values": res_qc["missing_values"][res_qc["missing_values"] > 0],
+                    "missing_percent(%)": res_qc["missing_percentage"][
+                        res_qc["missing_percentage"] > 0
+                    ],
+                }
+            ).style.background_gradient(cmap="coolwarm", axis=0)
+            )
         # print(res_qc["missing_percentage"][res_qc["missing_percentage"] > 0])
         print("\n⤵ Rows with Missing Values:",res_qc["rows_with_missing"])
 
+        print("\n⤵ Constant Columns:", res_qc["constant_columns"]) if any(res_qc["constant_columns"]) else None
+        print("⤵ Duplicate Rows:", res_qc["duplicate_rows"]) if res_qc["duplicate_rows"] else None
+        print("⤵ Duplicate Columns:", res_qc["duplicate_columns"]) if any(res_qc["duplicate_columns"]) else None
+
         if any(res_qc["outlier_num"]):
             print("\n⤵ Outlier Report:")
-            display(
-
+            display(pd.DataFrame(
+                {
+                    "outlier_num": res_qc["outlier_num"][res_qc["outlier_num"] > 0],
+                    "outlier_percentage(%)": res_qc["outlier_percentage"][
+                        res_qc["outlier_percentage"] > 0
+                    ],
+                }
+            ).style.background_gradient(cmap="coolwarm", axis=0)
+            )
+
+        if any(res_qc["unique_counts"]):
             print("\n⤵ Unique Values per Column:")
-            display(res_qc["
+            display(pd.DataFrame({"unique_counts":res_qc["unique_counts"],
+                                  "unique_values":res_qc["unique_values"]}).style.background_gradient(cmap="coolwarm", axis=0))
 
-        print("\n⤵ Constant Columns:", res_qc["constant_columns"]) if any(res_qc["constant_columns"]) else None
-
-        print("⤵ Duplicate Rows:", res_qc["duplicate_rows"]) if res_qc["duplicate_rows"] else None
-        print("⤵ Duplicate Columns:", res_qc["duplicate_columns"]) if any(res_qc["duplicate_columns"]) else None
 
         if res_qc["empty_columns"]:
             print("\n⤵ Empty Columns:", res_qc["empty_columns"])
@@ -7595,7 +7730,7 @@ def df_qc(
 
         if "vif" in res_qc:
             print("\n⤵ Features with High VIF (>|5|):")
-
+            display(res_qc["vif"].style.background_gradient(cmap="coolwarm", axis=0))
 
         if any(res_qc["high_cardinality_categoricals"]):
             print("\n⤵ High Cardinality Categorical Columns (>|50 unique|):")
@@ -7614,6 +7749,8 @@ def df_qc(
         print("\nWarnings:")
         for warning in res_qc["warnings"]:
             print(" -", warning)
+
+    pd.reset_option("display.max_seq_items")
     if plot_:
         df_qc_plots(data=data, res_qc=res_qc, max_cols=max_cols,hue=hue,dir_save=dir_save)
     if output or not plot_:
@@ -7632,7 +7769,7 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20,
     if isinstance(columns, (list,pd.core.indexes.base.Index)):
         data=data[columns]
     len_total = len(res_qc)
-    n_row, n_col = int((len_total + 10)
+    n_row, n_col = int((len_total + 10)), 3
     nexttile = subplot(n_row, n_col, figsize=[5 * n_col, 5 * n_row],verbose=False)
 
     missing_data = res_qc["missing_values"][res_qc["missing_values"] > 0].sort_values(
@@ -7789,8 +7926,10 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20,
         title="Dtypes",
         ylabel="#",
         ax=ax_dtype_counts,
-        fontsize=8
+        fontsize=8 if len(dtype_counts.index)<=20 else 6,
     )
+    # from .plot import pie
+    # pie()
 
     # High cardinality: Show top categorical columns by unique value count
     high_cardinality = res_qc["high_cardinality_categoricals"]
@@ -7871,16 +8010,17 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20,
         title="Correlation Heatmap",
         ax=ax_heatmap
     )
-    # save figure
-    if dir_save:
-
+    # # save figure
+    # if dir_save:
+    #     figsave(dir_save,f"qc_plot_{now_}.pdf")
 
     if columns is not None:
         if isinstance(columns, (list,pd.core.indexes.base.Index)):
             data=data[columns]
-
-
-
+
+    # len_total = len(res_qc)
+    # n_row, n_col = int((len_total + 10) / 3), 3
+    # nexttile = subplot(n_row, n_col, figsize=[5 * n_col, 5 * n_row],verbose=False)
     #! check distribution
     data_num = data.select_dtypes(include=np.number)
     if len(data_num) > max_cols:
@@ -7907,7 +8047,43 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20,
         figsets(ylabel=f'Q-Q Plot:{column}',title=None)
     # save figure
     if dir_save:
-        figsave(dir_save,f"
+        figsave(dir_save,f"qc_plot_{now_}.pdf")
+
+def df_corr(df: pd.DataFrame, method="pearson"):
+    """
+    Compute correlation coefficients and p-values for a DataFrame.
+
+    Parameters:
+    - df (pd.DataFrame): Input DataFrame with numeric data.
+    - method (str): Correlation method ("pearson", "spearman", "kendall").
+
+    Returns:
+    - corr_matrix (pd.DataFrame): Correlation coefficient matrix.
+    - pval_matrix (pd.DataFrame): P-value matrix.
+    """
+    from scipy.stats import pearsonr, spearmanr, kendalltau
+
+    methods = ["pearson", "spearman", "kendall"]
+    method = strcmp(method, methods)[0]
+    methods_dict = {"pearson": pearsonr, "spearman": spearmanr, "kendall": kendalltau}
+
+    cols = df.columns
+    corr_matrix = pd.DataFrame(index=cols, columns=cols, dtype=float)
+    pval_matrix = pd.DataFrame(index=cols, columns=cols, dtype=float)
+    correlation_func = methods_dict[method]
+
+    for col1 in cols:
+        for col2 in cols:
+            if col1 == col2:
+                corr_matrix.loc[col1, col2] = 1.0
+                pval_matrix.loc[col1, col2] = 0.0
+            else:
+                corr, pval = correlation_func(df[col1], df[col2])
+                corr_matrix.loc[col1, col2] = corr
+                pval_matrix.loc[col1, col2] = pval
+
+    return corr_matrix, pval_matrix
+
 def use_pd(
     func_name="excel",
     verbose=True,
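df_corr, new in this release, complements DataFrame.corr() by also returning an element-wise p-value matrix, which pandas alone does not provide. A usage sketch, assuming py2ls 0.2.4.26 is installed:

import numpy as np
import pandas as pd
from py2ls.ips import df_corr

rng = np.random.default_rng(42)
df = pd.DataFrame(rng.normal(size=(50, 3)), columns=["a", "b", "c"])

corr_matrix, pval_matrix = df_corr(df, method="pearson")
print(corr_matrix.round(2))
print(pval_matrix.round(3))  # small p-values flag significant correlations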
@@ -7927,3 +8103,135 @@ def use_pd(
     except Exception as e:
         if verbose:
             print(e)
+
+def get_phone(phone_number: str, region: str = None,verbose=True):
+    """
+    usage:
+        info = get_phone(15237654321, "DE")
+        preview(info)
+
+    Extremely advanced phone number analysis function.
+
+    Args:
+        phone_number (str): The phone number to analyze.
+        region (str): None (Default). Tries to work with international numbers including country codes; otherwise, uses the specified region.
+
+    Returns:
+        dict: Comprehensive information about the phone number.
+    """
+    import phonenumbers
+    from phonenumbers import geocoder, carrier, timezone, number_type
+    from datetime import datetime
+    import pytz
+    from tzlocal import get_localzone
+
+    if not isinstance(phone_number, str):
+        phone_number = str(phone_number)
+    if isinstance(region, str):
+        region = region.upper()
+
+    try:
+        # Parse the phone number
+        parsed_number = phonenumbers.parse(phone_number, region)
+
+        # Validate the phone number
+        valid = phonenumbers.is_valid_number(parsed_number)
+        possible = phonenumbers.is_possible_number(parsed_number)
+
+        if not valid:
+            suggested_fix = phonenumbers.example_number(region) if region else "Unknown"
+            return {
+                "valid": False,
+                "error": "Invalid phone number",
+                "suggested_fix": suggested_fix,
+            }
+
+        # Basic details
+        formatted_international = phonenumbers.format_number(
+            parsed_number, phonenumbers.PhoneNumberFormat.INTERNATIONAL
+        )
+        formatted_national = phonenumbers.format_number(
+            parsed_number, phonenumbers.PhoneNumberFormat.NATIONAL
+        )
+        formatted_e164 = phonenumbers.format_number(
+            parsed_number, phonenumbers.PhoneNumberFormat.E164
+        )
+        country_code = parsed_number.country_code
+        region_code = geocoder.region_code_for_number(parsed_number)
+        country_name = geocoder.country_name_for_number(parsed_number, "en")
+
+        location = geocoder.description_for_number(parsed_number, "en")
+        carrier_name = carrier.name_for_number(parsed_number, "en") or "Unknown Carrier"
+        time_zones = timezone.time_zones_for_number(parsed_number)[0]
+        current_times = datetime.now(pytz.timezone(time_zones)).strftime(
+            "%Y-%m-%d %H:%M:%S %Z"
+        )
+        number_type_str = {
+            phonenumbers.PhoneNumberType.FIXED_LINE: "Fixed Line",
+            phonenumbers.PhoneNumberType.MOBILE: "Mobile",
+            phonenumbers.PhoneNumberType.FIXED_LINE_OR_MOBILE: "Fixed Line or Mobile",
+            phonenumbers.PhoneNumberType.TOLL_FREE: "Toll Free",
+            phonenumbers.PhoneNumberType.PREMIUM_RATE: "Premium Rate",
+            phonenumbers.PhoneNumberType.SHARED_COST: "Shared Cost",
+            phonenumbers.PhoneNumberType.VOIP: "VOIP",
+            phonenumbers.PhoneNumberType.PERSONAL_NUMBER: "Personal Number",
+            phonenumbers.PhoneNumberType.PAGER: "Pager",
+            phonenumbers.PhoneNumberType.UAN: "UAN",
+            phonenumbers.PhoneNumberType.UNKNOWN: "Unknown",
+        }.get(number_type(parsed_number), "Unknown")
+
+        # Advanced Features
+        is_toll_free = (
+            number_type(parsed_number) == phonenumbers.PhoneNumberType.TOLL_FREE
+        )
+        is_premium_rate = (
+            number_type(parsed_number) == phonenumbers.PhoneNumberType.PREMIUM_RATE
+        )
+
+        # Dialing Information
+        dialing_instructions = f"Dial {formatted_national} within {country_name}. Dial {formatted_e164} from abroad."
+
+        # Advanced Timezone Handling
+        gmt_offsets = pytz.timezone(time_zones).utcoffset(datetime.now()).total_seconds()/ 3600
+        # Get the local timezone (current computer's time)
+        local_timezone = get_localzone()
+        #local_timezone = pytz.timezone(pytz.country_timezones[region_code][0])
+        local_offset = local_timezone.utcoffset(datetime.now()).total_seconds() / 3600
+        offset_diff = local_offset - gmt_offsets
+        head_time = "earlier" if offset_diff < 0 else "later" if offset_diff > 0 else ""
+        res= {
+            "valid": True,
+            "possible": possible,
+            "formatted": {
+                "international": formatted_international,
+                "national": formatted_national,
+                "e164": formatted_e164,
+            },
+            "country_code": country_code,
+            "country_name": country_name,
+            "region_code": region_code,
+            "location": location if location else "Unknown",
+            "carrier": carrier_name,
+            "time_zone": time_zones,
+            "current_times": current_times,
+            "local_offset":f"{local_offset} utcoffset",
+            "time_zone_diff": f"{head_time} {int(np.abs(offset_diff))} h",
+            "number_type": number_type_str,
+            "is_toll_free": is_toll_free,
+            "is_premium_rate": is_premium_rate,
+            "dialing_instructions": dialing_instructions,
+            "suggested_fix": None,  # Use phonenumbers.example_number if invalid
+            "logs": {
+                "number_analysis_completed": datetime.now().strftime(
+                    "%Y-%m-%d %H:%M:%S"
+                ),
+                "raw_input": phone_number,
+                "parsed_number": str(parsed_number),
+            },
+        }
+
+    except phonenumbers.NumberParseException as e:
+        res= {"valid": False, "error": str(e)}
+    if verbose:
+        preview(res)
+    return res
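get_phone builds entirely on the phonenumbers package (plus pytz and tzlocal for the offset arithmetic), so those must be installed alongside py2ls. The docstring's own example doubles as a smoke test; a sketch:

from py2ls.ips import get_phone

# Region is only needed when the number lacks a country code
info = get_phone("15237654321", region="DE", verbose=False)
if info["valid"]:
    print(info["formatted"]["e164"], info["number_type"], info["time_zone"])
else:
    print(info["error"])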