py2ls 0.2.4.25__py3-none-any.whl → 0.2.4.26__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- py2ls/.DS_Store +0 -0
- py2ls/.git/index +0 -0
- py2ls/corr.py +475 -0
- py2ls/data/.DS_Store +0 -0
- py2ls/data/hyper_param_autogluon_zeroshot2024.json +2383 -0
- py2ls/data/styles/.DS_Store +0 -0
- py2ls/data/styles/example/.DS_Store +0 -0
- py2ls/data/usages_sns.json +6 -1
- py2ls/ips.py +399 -91
- py2ls/ml2ls.py +758 -186
- py2ls/netfinder.py +16 -20
- py2ls/plot.py +916 -141
- {py2ls-0.2.4.25.dist-info → py2ls-0.2.4.26.dist-info}/METADATA +5 -1
- {py2ls-0.2.4.25.dist-info → py2ls-0.2.4.26.dist-info}/RECORD +15 -13
- py2ls/data/usages_pd copy.json +0 -1105
- {py2ls-0.2.4.25.dist-info → py2ls-0.2.4.26.dist-info}/WHEEL +0 -0
py2ls/ips.py
CHANGED
@@ -16,6 +16,7 @@ import warnings
 
 warnings.simplefilter("ignore", category=pd.errors.SettingWithCopyWarning)
 warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning)
+warnings.filterwarnings("ignore")
 
 
 def run_once_within(duration=60,reverse=False): # default 60s
@@ -541,8 +542,7 @@ def is_text(s):
 
 from typing import Any, Union
 
-
-def shared(*args, strict=True, n_shared=2, verbose=True):
+def share(*args, strict=True, n_shared=2, verbose=True):
     """
     check the shared elelements in two list.
     usage:
@@ -587,12 +587,68 @@ def shared(*args, strict=True, n_shared=2, verbose=True):
         elements2show = (
             shared_elements if len(shared_elements) < 10 else shared_elements[:5]
         )
+        tail = '' if len(shared_elements) < 10 else '......'
+        elements2show.append(tail)
         print(f"{' '*2}{len(shared_elements)} elements shared: {' '*2}{elements2show}")
         print("********* checking shared elements *********")
     return shared_elements
 
+def shared(*args, n_shared=None, verbose=True,**kwargs):
+    """
+    check the shared elelements in two list.
+    usage:
+        list1 = [1, 2, 3, 4, 5]
+        list2 = [4, 5, 6, 7, 8]
+        list3 = [5, 6, 9, 10]
+        a = shared(list1, list2,list3)
+    """
+    if verbose:
+        print("\n********* checking shared elements *********")
+
+    if len(args) == 1 and isinstance(args[0], list):
+        lists = args[0]  # Unpack the single list
+    else:
+        lists = args  # Use the provided arguments as lists
+    flattened_lists = [flatten(lst, verbose=verbose) for lst in lists]
+
+    if n_shared is None:
+        n_shared = len(flattened_lists)
+        strict = True
+    else:
+        strict = False
+    # Ensure all arguments are lists
+    if any(not isinstance(lst, list) for lst in flattened_lists):
+        print(f"{' ' * 2}All inputs must be lists.")
+        return []
+    first_list = flattened_lists[0]
+    shared_elements = [
+        item for item in first_list if all(item in lst for lst in flattened_lists)
+    ]
+    if strict:
+        # Strict mode: require elements to be in all lists
+        shared_elements = set(flattened_lists[0])
+        for lst in flattened_lists[1:]:
+            shared_elements.intersection_update(lst)
+    else:
+        from collections import Counter
 
-
+        all_elements = [item for sublist in flattened_lists for item in sublist]
+        element_count = Counter(all_elements)
+        # Get elements that appear in at least n_shared lists
+        shared_elements = [
+            item for item, count in element_count.items() if count >= n_shared
+        ]
+
+    shared_elements = flatten(shared_elements, verbose=verbose)
+    if verbose:
+        elements2show = (
+            shared_elements if len(shared_elements) < 10 else shared_elements[:5]
+        )
+        print(f"{' '*2}{len(shared_elements)} elements shared: {' '*2}{elements2show}")
+        print("********* checking shared elements *********")
+    return shared_elements
+
+def share_not(*args, n_shared=None, verbose=False):
     """
     To find the elements in list1 that are not shared with list2 while maintaining the original order of list1
     usage:
@@ -600,7 +656,19 @@ def not_shared(*args, strict=True, n_shared=2, verbose=False):
         list2 = [4, 5, 6, 7, 8]
         not_shared(list1,list2)# output [1,3]
     """
-    _common = shared(*args,
+    _common = shared(*args, n_shared=n_shared, verbose=verbose)
+    list1 = flatten(args[0], verbose=verbose)
+    _not_shared = [item for item in list1 if item not in _common]
+    return _not_shared
+def not_shared(*args, n_shared=None, verbose=False):
+    """
+    To find the elements in list1 that are not shared with list2 while maintaining the original order of list1
+    usage:
+        list1 = [1, 8, 3, 3, 4, 5]
+        list2 = [4, 5, 6, 7, 8]
+        not_shared(list1,list2)# output [1,3]
+    """
+    _common = shared(*args, n_shared=n_shared, verbose=verbose)
     list1 = flatten(args[0], verbose=verbose)
     _not_shared = [item for item in list1 if item not in _common]
     return _not_shared
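For context, a minimal standalone sketch (plain Python, not the py2ls implementation itself) of what the rewritten shared/not_shared pair computes: a strict intersection across all lists by default, or membership in at least n_shared of them when that argument is given. The helper name shared_sketch is made up for illustration.

    from collections import Counter

    def shared_sketch(*lists, n_shared=None):
        # Default: element must appear in every list (strict intersection).
        if n_shared is None:
            common = set(lists[0])
            for lst in lists[1:]:
                common.intersection_update(lst)
            return sorted(common)
        # Otherwise: element must appear in at least n_shared of the lists.
        counts = Counter()
        for lst in lists:
            counts.update(set(lst))  # count each list once per element
        return [item for item, c in counts.items() if c >= n_shared]

    print(shared_sketch([1, 2, 3, 4, 5], [4, 5, 6, 7, 8], [5, 6, 9, 10]))               # [5]
    print(shared_sketch([1, 2, 3, 4, 5], [4, 5, 6, 7, 8], [5, 6, 9, 10], n_shared=2))   # [4, 5, 6]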
@@ -1983,7 +2051,6 @@ def fload(fpath, kind=None, **kwargs):
 
     def load_csv(fpath, **kwargs):
         from pandas.errors import EmptyDataError
-
         engine = kwargs.pop("engine", "pyarrow")# default: None
         sep = kwargs.pop("sep", None)# default: ','
         index_col = kwargs.pop("index_col", None)# default: None
@@ -1994,13 +2061,20 @@ def fload(fpath, kind=None, **kwargs):
         comment = kwargs.pop("comment", None)# default: None
         fmt = kwargs.pop("fmt", False)# default:
         chunksize = kwargs.pop("chunksize", None)# default: None
+
+        #check filesize
+        f_size=round(os.path.getsize(fpath) / 1024 / 1024, 3)
+        if f_size>=50: #50 MB
+            if chunksize is None:
+                chunksize = 5000
+                print(f"file size is {f_size}MB, then set the chunksize with {chunksize}")
         engine = "c" if chunksize else engine # when chunksize, recommend 'c'
         low_memory = kwargs.pop("low_memory", True)# default: True
         low_memory = (
             False if chunksize else True
         ) # when chunksize, recommend low_memory=False # default:
         verbose = kwargs.pop("verbose", False)
-        if run_once_within():
+        if run_once_within(reverse=True):
             use_pd("read_csv", verbose=verbose)
 
         if comment is None:# default: None
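The new size check auto-enables chunked reading for files over 50 MB. Roughly, the pandas pattern it leans on looks like this (a sketch; the 50 MB and 5000-row thresholds mirror the diff, the file name is hypothetical, and pandas' pyarrow engine does not support chunksize, which is why the diff switches to "c"):

    import os
    import pandas as pd

    fpath = "big_table.csv"  # hypothetical file
    f_size_mb = round(os.path.getsize(fpath) / 1024 / 1024, 3)

    if f_size_mb >= 50:
        # Read lazily in 5000-row chunks; only the C engine supports chunksize.
        chunks = pd.read_csv(fpath, chunksize=5000, engine="c", low_memory=False)
        df = pd.concat(chunks, ignore_index=True)
    else:
        df = pd.read_csv(fpath, engine="pyarrow")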
@@ -2176,7 +2250,7 @@ def fload(fpath, kind=None, **kwargs):
     def load_excel(fpath, **kwargs):
         engine = kwargs.get("engine", "openpyxl")
         verbose = kwargs.pop("verbose", False)
-        if run_once_within():
+        if run_once_within(reverse=True):
             use_pd("read_excel", verbose=verbose)
         df = pd.read_excel(fpath, engine=engine, **kwargs)
         try:
@@ -2206,7 +2280,7 @@ def fload(fpath, kind=None, **kwargs):
         engine = kwargs.get("engine", "pyarrow")
         verbose = kwargs.pop("verbose", False)
 
-        if run_once_within():
+        if run_once_within(reverse=True):
             use_pd("read_parquet", verbose=verbose)
         try:
             df = pd.read_parquet(fpath, engine=engine, **kwargs)
@@ -2383,13 +2457,13 @@ def fload(fpath, kind=None, **kwargs):
         return load_xml(fpath)
     elif kind in ["csv", "tsv"]:
         # verbose = kwargs.pop("verbose", False)
-        if run_once_within():
+        if run_once_within(reverse=True):
             use_pd("read_csv")
         content = load_csv(fpath, **kwargs)
         return content
     elif kind == "pkl":
         verbose = kwargs.pop("verbose", False)
-        if run_once_within():
+        if run_once_within(reverse=True):
             use_pd("read_pickle")
         return pd.read_pickle(fpath, **kwargs)
     elif kind in ["ods", "ods", "odt"]:
@@ -2420,12 +2494,12 @@ def fload(fpath, kind=None, **kwargs):
         return load_ipynb(fpath, **kwargs)
     elif kind in ["parquet", "snappy"]:
         verbose = kwargs.pop("verbose", False)
-        if run_once_within():
+        if run_once_within(reverse=True):
             use_pd("read_parquet")
         return load_parquet(fpath, **kwargs)
     elif kind == "feather":
         verbose = kwargs.pop("verbose", False)
-        if run_once_within():
+        if run_once_within(reverse=True):
             use_pd("read_feather")
         content = pd.read_feather(fpath, **kwargs)
         return content
@@ -2684,7 +2758,7 @@ def fsave(
         # https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html
 
         verbose = kwargs.pop("verbose", False)
-        if run_once_within():
+        if run_once_within(reverse=True):
             use_pd("to_csv", verbose=verbose)
         kwargs_csv = dict(
             path_or_buf=None,
@@ -2716,7 +2790,7 @@ def fsave(
     def save_xlsx(fpath, data, **kwargs):
         verbose = kwargs.pop("verbose", False)
         sheet_name = kwargs.pop("sheet_name", "Sheet1")
-        if run_once_within():
+        if run_once_within(reverse=True):
             use_pd("to_excel", verbose=verbose)
         if any(kwargs):
             format_excel(df=data, filename=fpath, **kwargs)
@@ -5911,6 +5985,9 @@ def df_scaler(
     scaler=None,
     method="standard",
     columns=None, # default, select all numeric col/row
+    feature_range=None,# specific for 'minmax'
+    vmin=0,
+    vmax=1,
     inplace=False,
     verbose=False, # show usage
     axis=0, # defalut column-wise
@@ -5943,11 +6020,13 @@ def df_scaler(
         scaler = StandardScaler(**kwargs)
     elif method == "minmax":
         from sklearn.preprocessing import MinMaxScaler
+        if feature_range is None:
+            feature_range=(vmin,vmax)
         if verbose:
             print("don't forget to define the range: e.g., 'feature_range=(0, 1)'. ")
             print("scales the features to the range [0, 1]. Adjust feature_range if you want a different range, like [-1, 1].")
             print("Use when the data does not follow a normal distribution and you need all features in a specific range (e.g., [0, 1]).\nIdeal for algorithms that do not assume a particular distribution, such as k-nearest neighbors and neural networks.")
-        scaler = MinMaxScaler(
+        scaler = MinMaxScaler(feature_range=feature_range,**kwargs)
     elif method == "robust":
         from sklearn.preprocessing import RobustScaler
         if verbose:
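The df_scaler change wires vmin/vmax through to scikit-learn's MinMaxScaler(feature_range=...). A minimal standalone equivalent (toy data; the diff's defaults would give (0, 1)):

    import pandas as pd
    from sklearn.preprocessing import MinMaxScaler

    df = pd.DataFrame({"a": [1.0, 5.0, 9.0], "b": [10.0, 20.0, 40.0]})

    vmin, vmax = -1, 1
    scaler = MinMaxScaler(feature_range=(vmin, vmax))
    scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
    print(scaled)  # each column now spans [-1, 1]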
@@ -6035,15 +6114,20 @@ def df_special_characters_cleaner(
 
     # 1. Clean column names by replacing special characters with underscores
     if "column" in where_:
-
+        try:
+            data.columns = data.columns.str.replace(r"[^\w\s]", "_", regex=True)
+        except Exception as e:
+            print(e)
 
     # 2. Clean only object-type columns (text columns)
-
-
-
-
-
-
+    try:
+        if "content" in where_:
+            for col in data.select_dtypes(include=["object"]).columns:
+                data[col] = data[col].str.replace(r"[^\w\s]", "", regex=True)
+        if data.index.dtype == "object" and index in where_:
+            data.index = data.index.str.replace(r"[^\w\s]", "_", regex=True)
+    except:
+        pass
     return data
 
 
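The cleaner's core is Series.str.replace with the regex [^\w\s], which matches anything that is not a word character or whitespace. A toy run in plain pandas (made-up column names):

    import pandas as pd

    df = pd.DataFrame({"col(a)": ["foo!", "bar?"], "col-b": ["x&y", "z"]})

    # Column names: special characters -> underscores
    df.columns = df.columns.str.replace(r"[^\w\s]", "_", regex=True)

    # Text cells: special characters stripped entirely
    for col in df.select_dtypes(include=["object"]).columns:
        df[col] = df[col].str.replace(r"[^\w\s]", "", regex=True)

    print(df.columns.tolist())  # ['col_a_', 'col_b']
    print(df)                   # foo / bar, xy / z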
@@ -6426,6 +6510,9 @@ def df_reducer(
         # "autoencoder","nmf",
     ]
     method = strcmp(method, methods)[0]
+    if run_once_within(reverse=True):
+        print(f"support methods:{methods}")
+
     if verbose:
         print(f"\nprocessing with using {dict_methods[method]}:")
     xlabel, ylabel = None, None
@@ -6433,16 +6520,20 @@ def df_reducer(
         columns = data.select_dtypes(include="number").columns.tolist()
     if hue is None:
         hue = data.select_dtypes(exclude="number").columns.tolist()
+        print(f"auto select the non-number as 'hue':{hue}")
     if isinstance(hue, list):
         print("Warning: hue is a list, only select the 1st one")
         hue = hue[0]
-    if not hue:
+    if not any(hue):
         # Select columns if specified, else use all columns
         X = data[columns].values if columns else data.values
     else:
         # Select columns to reduce and hue for LDA
-
-
+        try:
+            X = data[columns].values if columns else data.drop(columns=[hue]).values
+            y = data[hue].values
+        except:
+            pass
     print(X.shape)
     # Handle missing values
     if fill_missing:
@@ -6909,33 +7000,49 @@ def df_reducer(
         colname_met = "SVD_"
     # Quick plots
     if plot_ and (not method in ["isolation_forest"]):
-        from .plot import plotxy
-        if ax is None:
-
-
-
-
-        else:
-
+        from .plot import plotxy,figsets,get_color
+        # if ax is None:
+        #     if figsize is None:
+        #         _, ax = plt.subplots(figsize=cm2inch(8, 8))
+        #     else:
+        #         _, ax = plt.subplots(figsize=figsize)
+        # else:
+        #     ax = ax.cla()
         xlabel = f"{colname_met}1" if xlabel is None else xlabel
         ylabel = f"{colname_met}2" if ylabel is None else ylabel
+        palette=get_color(len(flatten(data[hue],verbose=0)))
+
+        reduced_df=reduced_df.sort_values(by=hue)
+        print(flatten(reduced_df[hue]))
         ax = plotxy(
             data=reduced_df,
             x=colname_met + "1",
             y=colname_met + "2",
             hue=hue,
-
+            palette=palette,
+            # size=size,
             edgecolor=edgecolor,
-            kind_="
-
-
-
-
-
-
+            kind_=["joint",
+                   # "kde",
+                   "ell",
+                   ],
+            kws_kde=dict(
+                hue=hue,
+                levels=2,
+                common_norm=False,
+                fill=True,
+                alpha=0.05,
+            ),
+            kws_joint=dict(kind='scatter',joint_kws=dict(s=size)),
+            kws_ellipse=dict(alpha=0.1,lw=1,label=None),
             verbose=False,
             **kwargs,
         )
+        figsets(
+            legend=dict(loc=legend_loc, markerscale=markerscale,bbox_to_anchor=bbox_to_anchor,ncols=ncols,fontsize=8),
+            xlabel=xlabel if xlabel else None,
+            ylabel=ylabel if ylabel else None,
+        )
 
     if inplace:
         # If inplace=True, add components back into the original data
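The plotting path now delegates to py2ls's plotxy with kind_=["joint", "ell"] and per-kind keyword dicts. As a rough standalone analogue in plain seaborn (not the plotxy API; the confidence ellipses are py2ls-specific, and the dataset here is just a stand-in for the reduced DataFrame):

    import seaborn as sns
    import matplotlib.pyplot as plt

    df = sns.load_dataset("iris")  # stand-in for reduced_df

    # Scatter with marginal distributions, colored by group — similar in
    # spirit to the "joint" kind used in the diff.
    g = sns.jointplot(data=df, x="sepal_length", y="sepal_width",
                      hue="species", kind="scatter")
    plt.show()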
@@ -7412,6 +7519,7 @@ def df_qc(
     from statsmodels.stats.outliers_influence import variance_inflation_factor
     from scipy.stats import skew, kurtosis, entropy
 
+    pd.options.display.max_seq_items = 10
     #! display(data.select_dtypes(include=[np.number]).describe())
     #!skim
     if columns is not None:
@@ -7428,16 +7536,18 @@ def df_qc(
     data = data.copy()
     data.loc[:, data.isna().all()] = 0
     res_qc = {}
-    print(f"data.shape:{data.shape}")
+    print(f"⤵ data.shape:{data.shape}\n⤵ data.sample(10):")
+    display(data.sample(10).style.background_gradient(cmap="coolwarm", axis=1))
 
     # Missing values
     res_qc["missing_values"] = data.isnull().sum()
-    res_qc["missing_percentage"] = (res_qc["missing_values"] / len(data)) * 100
+    res_qc["missing_percentage"] = round((res_qc["missing_values"] / len(data)) * 100,2)
     res_qc["rows_with_missing"] = data.isnull().any(axis=1).sum()
 
     # Data types and unique values
     res_qc["data_types"] = data.dtypes
-    res_qc["
+    res_qc["unique_counts"] = data.select_dtypes(exclude=np.number).nunique().sort_values()
+    res_qc["unique_values"] = data.select_dtypes(exclude=np.number).apply(lambda x: x.unique())
     res_qc["constant_columns"] = [
         col for col in data.columns if data[col].nunique() <= 1
     ]
@@ -7453,33 +7563,42 @@ def df_qc(
     data_outliers = df_outlier(data)
     outlier_num = data_outliers.isna().sum() - data.isnull().sum()
     res_qc["outlier_num"] = outlier_num[outlier_num > 0]
-    outlier_percentage=(outlier_num / len(data_outliers)) * 100
+    outlier_percentage=round((outlier_num / len(data_outliers)) * 100,2)
     res_qc["outlier_percentage"] = outlier_percentage[outlier_percentage>0]
-
-
-
-
-
-
-
-
-
-
-    res_qc["high_correlations"] = high_corr_pairs
-
-    # VIF for multicollinearity check
-    numeric_df = data.select_dtypes(include=[np.number]).dropna()
-    vif_data = pd.DataFrame()
-    res_qc["vif"]=vif_data
-    if numeric_df.shape[1] > 1 and not numeric_df.empty:
-        vif_data["feature"] = numeric_df.columns
-        vif_data["VIF"] = [
-            variance_inflation_factor(numeric_df.values, i)
-            for i in range(numeric_df.shape[1])
+    try:
+        # Correlation and multicollinearity (VIF)
+        if any(data.dtypes.apply(pd.api.types.is_numeric_dtype)):
+            numeric_df = data.select_dtypes(include=[np.number]).dropna()
+            corr_matrix = numeric_df.corr()
+            high_corr_pairs = [
+                (col1, col2)
+                for col1 in corr_matrix.columns
+                for col2 in corr_matrix.columns
+                if col1 != col2 and abs(corr_matrix[col1][col2]) > 0.9
             ]
-    res_qc["
-
-
+            res_qc["high_correlations"] = high_corr_pairs
+
+            # VIF for multicollinearity check
+            numeric_df = data.select_dtypes(include=[np.number]).dropna()
+            if isinstance(numeric_df.columns, pd.MultiIndex):
+                numeric_df.columns = [
+                    "_".join(col).strip() if isinstance(col, tuple) else col for col in numeric_df.columns
+                ]
+
+
+            vif_data = pd.DataFrame()
+            res_qc["vif"]=vif_data
+            if numeric_df.shape[1] > 1 and not numeric_df.empty:
+                vif_data["feature"] = numeric_df.columns.tolist()
+                vif_data["VIF"] = [
+                    round(variance_inflation_factor(numeric_df.values, i),2)
+                    for i in range(numeric_df.shape[1])
+                ]
+                res_qc["vif"] = vif_data[
+                    vif_data["VIF"] > 5
+                ] # Typically VIF > 5 indicates multicollinearity
+    except Exception as e:
+        print(e)
     # Skewness and Kurtosis
     skewness = data.skew(numeric_only=True)
     kurtosis_vals = data.kurt(numeric_only=True)
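For reference, the VIF computation the hunk wraps in try/except can be reproduced standalone with statsmodels (a sketch on synthetic data; the >5 threshold mirrors the diff):

    import numpy as np
    import pandas as pd
    from statsmodels.stats.outliers_influence import variance_inflation_factor

    rng = np.random.default_rng(0)
    x1 = rng.normal(size=200)
    df = pd.DataFrame({
        "x1": x1,
        "x2": x1 * 0.95 + rng.normal(scale=0.1, size=200),  # nearly collinear with x1
        "x3": rng.normal(size=200),
    })

    vif = pd.DataFrame({
        "feature": df.columns,
        "VIF": [round(variance_inflation_factor(df.values, i), 2)
                for i in range(df.shape[1])],
    })
    print(vif[vif["VIF"] > 5])  # x1 and x2 should flag as multicollinear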
@@ -7492,8 +7611,7 @@ def df_qc(
         col: entropy(data[col].value_counts(normalize=True), base=2)
         for col in categorical_cols
     }
-
-    res_qc["unique_counts"] = data.nunique()
+
 
     # dtypes counts
     res_qc['dtype_counts']=data.dtypes.value_counts()
@@ -7540,7 +7658,7 @@ def df_qc(
     res_qc["text_length_analysis"] = text_lengths
 
     # Summary statistics
-    res_qc["summary_statistics"] = data.describe().T
+    res_qc["summary_statistics"] = data.describe().T.style.background_gradient(cmap='coolwarm', axis=0)
 
     # Automated warnings
     warnings = []
@@ -7562,28 +7680,45 @@ def df_qc(
 
     # Report generation
     if verbose:
-        print("=== QC Report Summary ===")
         print("\n⤵ Summary Statistics:")
         display(res_qc["summary_statistics"])
         print("\n⤵ Data Types:")
         display(res_qc["data_types"])
         if any(res_qc["missing_values"][res_qc["missing_values"] > 0]):
             print(" ⤵ Missing Values Counts:")
-            display(
+            display(pd.DataFrame(
+                {
+                    "missing_values": res_qc["missing_values"][res_qc["missing_values"] > 0],
+                    "missing_percent(%)": res_qc["missing_percentage"][
+                        res_qc["missing_percentage"] > 0
+                    ],
+                }
+            ).style.background_gradient(cmap="coolwarm", axis=0)
+            )
             # print(res_qc["missing_percentage"][res_qc["missing_percentage"] > 0])
             print("\n⤵ Rows with Missing Values:",res_qc["rows_with_missing"])
 
+        print("\n⤵ Constant Columns:", res_qc["constant_columns"]) if any(res_qc["constant_columns"]) else None
+        print("⤵ Duplicate Rows:", res_qc["duplicate_rows"]) if res_qc["duplicate_rows"] else None
+        print("⤵ Duplicate Columns:", res_qc["duplicate_columns"]) if any(res_qc["duplicate_columns"]) else None
+
         if any(res_qc["outlier_num"]):
             print("\n⤵ Outlier Report:")
-            display(
-
+            display(pd.DataFrame(
+                {
+                    "outlier_num": res_qc["outlier_num"][res_qc["outlier_num"] > 0],
+                    "outlier_percentage(%)": res_qc["outlier_percentage"][
+                        res_qc["outlier_percentage"] > 0
+                    ],
+                }
+            ).style.background_gradient(cmap="coolwarm", axis=0)
+            )
+
+        if any(res_qc["unique_counts"]):
             print("\n⤵ Unique Values per Column:")
-            display(res_qc["
+            display(pd.DataFrame({"unique_counts":res_qc["unique_counts"],
+                                  "unique_values":res_qc["unique_values"]}).style.background_gradient(cmap="coolwarm", axis=0))
 
-        print("\n⤵ Constant Columns:", res_qc["constant_columns"]) if any(res_qc["constant_columns"]) else None
-
-        print("⤵ Duplicate Rows:", res_qc["duplicate_rows"]) if res_qc["duplicate_rows"] else None
-        print("⤵ Duplicate Columns:", res_qc["duplicate_columns"]) if any(res_qc["duplicate_columns"]) else None
 
         if res_qc["empty_columns"]:
             print("\n⤵ Empty Columns:", res_qc["empty_columns"])
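The report now builds small DataFrames and renders them through pandas' Styler; note that .style.background_gradient(...) returns a Styler, not a DataFrame, so it suits notebook display but not further frame operations. A minimal sketch of the missing-values table on toy data:

    import pandas as pd
    from IPython.display import display  # `display` as used in the diff

    df = pd.DataFrame({"a": [1, None, 3], "b": [None, None, 6]})

    missing = df.isnull().sum()
    pct = round(missing / len(df) * 100, 2)
    report = pd.DataFrame({
        "missing_values": missing[missing > 0],
        "missing_percent(%)": pct[pct > 0],
    })
    display(report.style.background_gradient(cmap="coolwarm", axis=0))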
@@ -7595,7 +7730,7 @@ def df_qc(
 
     if "vif" in res_qc:
         print("\n⤵ Features with High VIF (>|5|):")
-
+        display(res_qc["vif"].style.background_gradient(cmap="coolwarm", axis=0))
 
     if any(res_qc["high_cardinality_categoricals"]):
         print("\n⤵ High Cardinality Categorical Columns (>|50 unique|):")
@@ -7614,6 +7749,8 @@ def df_qc(
         print("\nWarnings:")
         for warning in res_qc["warnings"]:
             print(" -", warning)
+
+    pd.reset_option("display.max_seq_items")
     if plot_:
         df_qc_plots(data=data, res_qc=res_qc, max_cols=max_cols,hue=hue,dir_save=dir_save)
     if output or not plot_:
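This hunk pairs the pd.options.display.max_seq_items = 10 set earlier in df_qc with a pd.reset_option at the end. A design note, not what the diff does: pandas' option_context restores the previous value automatically, even if the body raises.

    import pandas as pd

    with pd.option_context("display.max_seq_items", 10):
        # long indexes printed inside this block are truncated to 10 items
        print(pd.Index(range(100)))
    # option is back to its previous value here, even on exceptions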
@@ -7632,7 +7769,7 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20,
     if isinstance(columns, (list,pd.core.indexes.base.Index)):
         data=data[columns]
     len_total = len(res_qc)
-    n_row, n_col = int((len_total + 10)
+    n_row, n_col = int((len_total + 10)), 3
     nexttile = subplot(n_row, n_col, figsize=[5 * n_col, 5 * n_row],verbose=False)
 
     missing_data = res_qc["missing_values"][res_qc["missing_values"] > 0].sort_values(
@@ -7789,8 +7926,10 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20,
         title="Dtypes",
         ylabel="#",
         ax=ax_dtype_counts,
-        fontsize=8
+        fontsize=8 if len(dtype_counts.index)<=20 else 6,
     )
+    # from .plot import pie
+    # pie()
 
     # High cardinality: Show top categorical columns by unique value count
     high_cardinality = res_qc["high_cardinality_categoricals"]
@@ -7871,16 +8010,17 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20,
         title="Correlation Heatmap",
         ax=ax_heatmap
     )
-    # save figure
-    if dir_save:
-
+    # # save figure
+    # if dir_save:
+    #     figsave(dir_save,f"qc_plot_{now_}.pdf")
 
     if columns is not None:
         if isinstance(columns, (list,pd.core.indexes.base.Index)):
             data=data[columns]
-
-
-
+
+    # len_total = len(res_qc)
+    # n_row, n_col = int((len_total + 10) / 3), 3
+    # nexttile = subplot(n_row, n_col, figsize=[5 * n_col, 5 * n_row],verbose=False)
     #! check distribution
     data_num = data.select_dtypes(include=np.number)
     if len(data_num) > max_cols:
@@ -7907,7 +8047,43 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20,
         figsets(ylabel=f'Q-Q Plot:{column}',title=None)
     # save figure
     if dir_save:
-        figsave(dir_save,f"
+        figsave(dir_save,f"qc_plot_{now_}.pdf")
+
+def df_corr(df: pd.DataFrame, method="pearson"):
+    """
+    Compute correlation coefficients and p-values for a DataFrame.
+
+    Parameters:
+    - df (pd.DataFrame): Input DataFrame with numeric data.
+    - method (str): Correlation method ("pearson", "spearman", "kendall").
+
+    Returns:
+    - corr_matrix (pd.DataFrame): Correlation coefficient matrix.
+    - pval_matrix (pd.DataFrame): P-value matrix.
+    """
+    from scipy.stats import pearsonr, spearmanr, kendalltau
+
+    methods = ["pearson", "spearman", "kendall"]
+    method = strcmp(method, methods)[0]
+    methods_dict = {"pearson": pearsonr, "spearman": spearmanr, "kendall": kendalltau}
+
+    cols = df.columns
+    corr_matrix = pd.DataFrame(index=cols, columns=cols, dtype=float)
+    pval_matrix = pd.DataFrame(index=cols, columns=cols, dtype=float)
+    correlation_func = methods_dict[method]
+
+    for col1 in cols:
+        for col2 in cols:
+            if col1 == col2:
+                corr_matrix.loc[col1, col2] = 1.0
+                pval_matrix.loc[col1, col2] = 0.0
+            else:
+                corr, pval = correlation_func(df[col1], df[col2])
+                corr_matrix.loc[col1, col2] = corr
+                pval_matrix.loc[col1, col2] = pval
+
+    return corr_matrix, pval_matrix
+
 def use_pd(
     func_name="excel",
     verbose=True,
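A quick usage sketch for the new df_corr (made-up column names); each off-diagonal cell wraps the matching pairwise scipy test:

    import numpy as np
    import pandas as pd
    from scipy.stats import spearmanr

    rng = np.random.default_rng(1)
    df = pd.DataFrame({"height": rng.normal(170, 10, 50),
                       "weight": rng.normal(70, 8, 50)})

    # In py2ls: corr_matrix, pval_matrix = df_corr(df, method="spearman")
    # Each off-diagonal cell is equivalent to the pairwise scipy call:
    rho, pval = spearmanr(df["height"], df["weight"])
    print(round(rho, 3), round(pval, 3))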
@@ -7927,3 +8103,135 @@ def use_pd(
     except Exception as e:
         if verbose:
             print(e)
+
+def get_phone(phone_number: str, region: str = None,verbose=True):
+    """
+    usage:
+        info = get_phone(15237654321, "DE")
+        preview(info)
+
+    Extremely advanced phone number analysis function.
+
+    Args:
+        phone_number (str): The phone number to analyze.
+        region (str): None (Default). Tries to work with international numbers including country codes; otherwise, uses the specified region.
+
+    Returns:
+        dict: Comprehensive information about the phone number.
+    """
+    import phonenumbers
+    from phonenumbers import geocoder, carrier, timezone, number_type
+    from datetime import datetime
+    import pytz
+    from tzlocal import get_localzone
+
+    if not isinstance(phone_number, str):
+        phone_number = str(phone_number)
+    if isinstance(region, str):
+        region = region.upper()
+
+    try:
+        # Parse the phone number
+        parsed_number = phonenumbers.parse(phone_number, region)
+
+        # Validate the phone number
+        valid = phonenumbers.is_valid_number(parsed_number)
+        possible = phonenumbers.is_possible_number(parsed_number)
+
+        if not valid:
+            suggested_fix = phonenumbers.example_number(region) if region else "Unknown"
+            return {
+                "valid": False,
+                "error": "Invalid phone number",
+                "suggested_fix": suggested_fix,
+            }
+
+        # Basic details
+        formatted_international = phonenumbers.format_number(
+            parsed_number, phonenumbers.PhoneNumberFormat.INTERNATIONAL
+        )
+        formatted_national = phonenumbers.format_number(
+            parsed_number, phonenumbers.PhoneNumberFormat.NATIONAL
+        )
+        formatted_e164 = phonenumbers.format_number(
+            parsed_number, phonenumbers.PhoneNumberFormat.E164
+        )
+        country_code = parsed_number.country_code
+        region_code = geocoder.region_code_for_number(parsed_number)
+        country_name = geocoder.country_name_for_number(parsed_number, "en")
+
+        location = geocoder.description_for_number(parsed_number, "en")
+        carrier_name = carrier.name_for_number(parsed_number, "en") or "Unknown Carrier"
+        time_zones = timezone.time_zones_for_number(parsed_number)[0]
+        current_times = datetime.now(pytz.timezone(time_zones)).strftime(
+            "%Y-%m-%d %H:%M:%S %Z"
+        )
+        number_type_str = {
+            phonenumbers.PhoneNumberType.FIXED_LINE: "Fixed Line",
+            phonenumbers.PhoneNumberType.MOBILE: "Mobile",
+            phonenumbers.PhoneNumberType.FIXED_LINE_OR_MOBILE: "Fixed Line or Mobile",
+            phonenumbers.PhoneNumberType.TOLL_FREE: "Toll Free",
+            phonenumbers.PhoneNumberType.PREMIUM_RATE: "Premium Rate",
+            phonenumbers.PhoneNumberType.SHARED_COST: "Shared Cost",
+            phonenumbers.PhoneNumberType.VOIP: "VOIP",
+            phonenumbers.PhoneNumberType.PERSONAL_NUMBER: "Personal Number",
+            phonenumbers.PhoneNumberType.PAGER: "Pager",
+            phonenumbers.PhoneNumberType.UAN: "UAN",
+            phonenumbers.PhoneNumberType.UNKNOWN: "Unknown",
+        }.get(number_type(parsed_number), "Unknown")
+
+        # Advanced Features
+        is_toll_free = (
+            number_type(parsed_number) == phonenumbers.PhoneNumberType.TOLL_FREE
+        )
+        is_premium_rate = (
+            number_type(parsed_number) == phonenumbers.PhoneNumberType.PREMIUM_RATE
+        )
+
+        # Dialing Information
+        dialing_instructions = f"Dial {formatted_national} within {country_name}. Dial {formatted_e164} from abroad."
+
+        # Advanced Timezone Handling
+        gmt_offsets = pytz.timezone(time_zones).utcoffset(datetime.now()).total_seconds()/ 3600
+        # Get the local timezone (current computer's time)
+        local_timezone = get_localzone()
+        #local_timezone = pytz.timezone(pytz.country_timezones[region_code][0])
+        local_offset = local_timezone.utcoffset(datetime.now()).total_seconds() / 3600
+        offset_diff = local_offset - gmt_offsets
+        head_time = "earlier" if offset_diff < 0 else "later" if offset_diff > 0 else ""
+        res= {
+            "valid": True,
+            "possible": possible,
+            "formatted": {
+                "international": formatted_international,
+                "national": formatted_national,
+                "e164": formatted_e164,
+            },
+            "country_code": country_code,
+            "country_name": country_name,
+            "region_code": region_code,
+            "location": location if location else "Unknown",
+            "carrier": carrier_name,
+            "time_zone": time_zones,
+            "current_times": current_times,
+            "local_offset":f"{local_offset} utcoffset",
+            "time_zone_diff": f"{head_time} {int(np.abs(offset_diff))} h",
+            "number_type": number_type_str,
+            "is_toll_free": is_toll_free,
+            "is_premium_rate": is_premium_rate,
+            "dialing_instructions": dialing_instructions,
+            "suggested_fix": None, # Use phonenumbers.example_number if invalid
+            "logs": {
+                "number_analysis_completed": datetime.now().strftime(
+                    "%Y-%m-%d %H:%M:%S"
+                ),
+                "raw_input": phone_number,
+                "parsed_number": str(parsed_number),
+            },
+        }
+
+    except phonenumbers.NumberParseException as e:
+        res= {"valid": False, "error": str(e)}
+    if verbose:
+        preview(res)
+    return res
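A hedged usage sketch of the phonenumbers calls that get_phone wraps (the number below is a fictitious German example, so the printed details are illustrative only):

    import phonenumbers
    from phonenumbers import geocoder, carrier, timezone

    parsed = phonenumbers.parse("030 123456", "DE")  # fictitious number
    print(phonenumbers.is_valid_number(parsed))
    print(phonenumbers.format_number(parsed, phonenumbers.PhoneNumberFormat.E164))
    print(geocoder.description_for_number(parsed, "en"))   # e.g. city/region
    print(carrier.name_for_number(parsed, "en"))           # often empty for fixed lines
    print(timezone.time_zones_for_number(parsed))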