py2ls 0.2.4.24__py3-none-any.whl → 0.2.4.26__py3-none-any.whl
- py2ls/.DS_Store +0 -0
- py2ls/.git/index +0 -0
- py2ls/corr.py +475 -0
- py2ls/data/.DS_Store +0 -0
- py2ls/data/hyper_param_autogluon_zeroshot2024.json +2383 -0
- py2ls/data/styles/.DS_Store +0 -0
- py2ls/data/styles/example/.DS_Store +0 -0
- py2ls/data/usages_sns.json +6 -1
- py2ls/ec2ls.py +61 -0
- py2ls/ips.py +496 -138
- py2ls/ml2ls.py +994 -288
- py2ls/netfinder.py +16 -20
- py2ls/nl2ls.py +283 -0
- py2ls/plot.py +1244 -158
- {py2ls-0.2.4.24.dist-info → py2ls-0.2.4.26.dist-info}/METADATA +5 -1
- {py2ls-0.2.4.24.dist-info → py2ls-0.2.4.26.dist-info}/RECORD +17 -14
- py2ls/data/usages_pd copy.json +0 -1105
- py2ls/ml2ls copy.py +0 -2906
- {py2ls-0.2.4.24.dist-info → py2ls-0.2.4.26.dist-info}/WHEEL +0 -0
py2ls/ips.py
CHANGED
@@ -16,17 +16,20 @@ import warnings
 
 warnings.simplefilter("ignore", category=pd.errors.SettingWithCopyWarning)
 warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning)
+warnings.filterwarnings("ignore")
 
 
-def run_once_within(duration=60): # default 60s
+def run_once_within(duration=60,reverse=False): # default 60s
     import time
 
     """
+    If reverse is True, do not run on the first call, but do run on the second call.
     usage:
     if run_once_within():
         print("This code runs once per minute.")
    else:
        print("The code has already been run in the last minute.")
+
    """
    if not hasattr(run_once_within, "time_last"):
        run_once_within.time_last = None
@@ -36,9 +39,9 @@ def run_once_within(duration=60): # default 60s
         time_curr - run_once_within.time_last >= duration
     ):
         run_once_within.time_last = time_curr  # Update the last execution time
-        return True
+        return False if reverse else True
     else:
-        return False
+        return True if reverse else False


 def plt_font(dir_font: str = "/System/Library/Fonts/Hiragino Sans GB.ttc"):
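The `reverse` flag added above inverts the gate: with the default `reverse=False` the first call inside a window returns True, while `reverse=True` returns False on the first call and True on repeat calls within the window; this is how the `use_pd(...)` usage hints in `fload`/`fsave` are gated later in this diff. A minimal sketch, assuming py2ls is installed:

# hedged illustration of the new `reverse` flag; both calls share run_once_within.time_last
from py2ls.ips import run_once_within

if run_once_within(duration=60):                 # True on the first call per minute
    print("runs on the first call of the minute")

if run_once_within(duration=60, reverse=True):   # False on the first call, True on repeats
    print("runs only on repeated calls within the same minute")

Because the state is shared, the second call above already falls inside the window opened by the first one, so it prints.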
@@ -539,8 +542,7 @@ def is_text(s):
 
 from typing import Any, Union
 
-
-def shared(*args, strict=True, n_shared=2, verbose=True):
+def share(*args, strict=True, n_shared=2, verbose=True):
     """
     check the shared elelements in two list.
     usage:
@@ -585,12 +587,80 @@ def shared(*args, strict=True, n_shared=2, verbose=True):
         elements2show = (
             shared_elements if len(shared_elements) < 10 else shared_elements[:5]
         )
+        tail = '' if len(shared_elements) < 10 else '......'
+        elements2show.append(tail)
         print(f"{' '*2}{len(shared_elements)} elements shared: {' '*2}{elements2show}")
         print("********* checking shared elements *********")
     return shared_elements
 
+def shared(*args, n_shared=None, verbose=True,**kwargs):
+    """
+    check the shared elelements in two list.
+    usage:
+    list1 = [1, 2, 3, 4, 5]
+    list2 = [4, 5, 6, 7, 8]
+    list3 = [5, 6, 9, 10]
+    a = shared(list1, list2,list3)
+    """
+    if verbose:
+        print("\n********* checking shared elements *********")
+
+    if len(args) == 1 and isinstance(args[0], list):
+        lists = args[0]  # Unpack the single list
+    else:
+        lists = args  # Use the provided arguments as lists
+    flattened_lists = [flatten(lst, verbose=verbose) for lst in lists]
+
+    if n_shared is None:
+        n_shared = len(flattened_lists)
+        strict = True
+    else:
+        strict = False
+    # Ensure all arguments are lists
+    if any(not isinstance(lst, list) for lst in flattened_lists):
+        print(f"{' ' * 2}All inputs must be lists.")
+        return []
+    first_list = flattened_lists[0]
+    shared_elements = [
+        item for item in first_list if all(item in lst for lst in flattened_lists)
+    ]
+    if strict:
+        # Strict mode: require elements to be in all lists
+        shared_elements = set(flattened_lists[0])
+        for lst in flattened_lists[1:]:
+            shared_elements.intersection_update(lst)
+    else:
+        from collections import Counter
+
+        all_elements = [item for sublist in flattened_lists for item in sublist]
+        element_count = Counter(all_elements)
+        # Get elements that appear in at least n_shared lists
+        shared_elements = [
+            item for item, count in element_count.items() if count >= n_shared
+        ]
+
+    shared_elements = flatten(shared_elements, verbose=verbose)
+    if verbose:
+        elements2show = (
+            shared_elements if len(shared_elements) < 10 else shared_elements[:5]
+        )
+        print(f"{' '*2}{len(shared_elements)} elements shared: {' '*2}{elements2show}")
+        print("********* checking shared elements *********")
+    return shared_elements
 
-def not_shared(*args, strict=True, n_shared=2, verbose=False):
+def share_not(*args, n_shared=None, verbose=False):
+    """
+    To find the elements in list1 that are not shared with list2 while maintaining the original order of list1
+    usage:
+    list1 = [1, 8, 3, 3, 4, 5]
+    list2 = [4, 5, 6, 7, 8]
+    not_shared(list1,list2)# output [1,3]
+    """
+    _common = shared(*args, n_shared=n_shared, verbose=verbose)
+    list1 = flatten(args[0], verbose=verbose)
+    _not_shared = [item for item in list1 if item not in _common]
+    return _not_shared
+def not_shared(*args, n_shared=None, verbose=False):
     """
     To find the elements in list1 that are not shared with list2 while maintaining the original order of list1
     usage:
@@ -598,7 +668,7 @@ def not_shared(*args, strict=True, n_shared=2, verbose=False):
     list2 = [4, 5, 6, 7, 8]
     not_shared(list1,list2)# output [1,3]
     """
-    _common = shared(*args,
+    _common = shared(*args, n_shared=n_shared, verbose=verbose)
     list1 = flatten(args[0], verbose=verbose)
     _not_shared = [item for item in list1 if item not in _common]
     return _not_shared
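In short: the old strict helper keeps its behaviour under the new name `share`, the new `shared` intersects all inputs by default and relaxes to "present in at least `n_shared` lists" when `n_shared` is given, and `share_not`/`not_shared` return what is left of the first list. A hedged usage sketch (the ordering of the relaxed result depends on `flatten` and is not guaranteed):

from py2ls.ips import shared, not_shared   # assumes py2ls >= 0.2.4.26

list1 = [1, 2, 3, 4, 5]
list2 = [4, 5, 6, 7, 8]
list3 = [5, 6, 9, 10]

shared(list1, list2, list3)               # element must appear in every list -> [5]
shared(list1, list2, list3, n_shared=2)   # appears in at least 2 of the 3 lists -> typically [4, 5, 6]
not_shared(list1, list2)                  # elements of list1 absent from list2 -> [1, 2, 3]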
@@ -1981,7 +2051,6 @@ def fload(fpath, kind=None, **kwargs):
 
     def load_csv(fpath, **kwargs):
         from pandas.errors import EmptyDataError
-
         engine = kwargs.pop("engine", "pyarrow")# default: None
         sep = kwargs.pop("sep", None)# default: ','
         index_col = kwargs.pop("index_col", None)# default: None
@@ -1992,13 +2061,20 @@ def fload(fpath, kind=None, **kwargs):
         comment = kwargs.pop("comment", None)# default: None
         fmt = kwargs.pop("fmt", False)# default:
         chunksize = kwargs.pop("chunksize", None)# default: None
+
+        #check filesize
+        f_size=round(os.path.getsize(fpath) / 1024 / 1024, 3)
+        if f_size>=50: #50 MB
+            if chunksize is None:
+                chunksize = 5000
+                print(f"file size is {f_size}MB, then set the chunksize with {chunksize}")
         engine = "c" if chunksize else engine # when chunksize, recommend 'c'
         low_memory = kwargs.pop("low_memory", True)# default: True
         low_memory = (
             False if chunksize else True
         ) # when chunksize, recommend low_memory=False # default:
         verbose = kwargs.pop("verbose", False)
-        if run_once_within():
+        if run_once_within(reverse=True):
             use_pd("read_csv", verbose=verbose)
 
         if comment is None:# default: None
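The new size check only kicks in for files of roughly 50 MB and larger, where it falls back to pandas' chunked reader. The underlying pattern is plain `pd.read_csv(..., chunksize=...)`; a minimal sketch outside the package (the path and threshold here are illustrative):

import os
import pandas as pd

fpath = "big_table.csv"                                   # hypothetical file
f_size = round(os.path.getsize(fpath) / 1024 / 1024, 3)   # size in MB

if f_size >= 50:
    # read in 5000-row chunks with the C engine, as the diff above does
    chunks = pd.read_csv(fpath, engine="c", chunksize=5000, low_memory=False)
    df = pd.concat(chunks, ignore_index=True)             # or process chunk by chunk
else:
    df = pd.read_csv(fpath)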
@@ -2174,7 +2250,7 @@ def fload(fpath, kind=None, **kwargs):
     def load_excel(fpath, **kwargs):
         engine = kwargs.get("engine", "openpyxl")
         verbose = kwargs.pop("verbose", False)
-        if run_once_within():
+        if run_once_within(reverse=True):
             use_pd("read_excel", verbose=verbose)
         df = pd.read_excel(fpath, engine=engine, **kwargs)
         try:
@@ -2204,7 +2280,7 @@ def fload(fpath, kind=None, **kwargs):
         engine = kwargs.get("engine", "pyarrow")
         verbose = kwargs.pop("verbose", False)
 
-        if run_once_within():
+        if run_once_within(reverse=True):
             use_pd("read_parquet", verbose=verbose)
         try:
             df = pd.read_parquet(fpath, engine=engine, **kwargs)
@@ -2381,13 +2457,13 @@ def fload(fpath, kind=None, **kwargs):
         return load_xml(fpath)
     elif kind in ["csv", "tsv"]:
         # verbose = kwargs.pop("verbose", False)
-        if run_once_within():
+        if run_once_within(reverse=True):
             use_pd("read_csv")
         content = load_csv(fpath, **kwargs)
         return content
     elif kind == "pkl":
         verbose = kwargs.pop("verbose", False)
-        if run_once_within():
+        if run_once_within(reverse=True):
             use_pd("read_pickle")
         return pd.read_pickle(fpath, **kwargs)
     elif kind in ["ods", "ods", "odt"]:
@@ -2418,12 +2494,12 @@ def fload(fpath, kind=None, **kwargs):
         return load_ipynb(fpath, **kwargs)
     elif kind in ["parquet", "snappy"]:
         verbose = kwargs.pop("verbose", False)
-        if run_once_within():
+        if run_once_within(reverse=True):
             use_pd("read_parquet")
         return load_parquet(fpath, **kwargs)
     elif kind == "feather":
         verbose = kwargs.pop("verbose", False)
-        if run_once_within():
+        if run_once_within(reverse=True):
             use_pd("read_feather")
         content = pd.read_feather(fpath, **kwargs)
         return content
@@ -2682,7 +2758,7 @@ def fsave(
         # https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html
 
         verbose = kwargs.pop("verbose", False)
-        if run_once_within():
+        if run_once_within(reverse=True):
             use_pd("to_csv", verbose=verbose)
         kwargs_csv = dict(
             path_or_buf=None,
@@ -2714,7 +2790,7 @@ def fsave(
     def save_xlsx(fpath, data, **kwargs):
         verbose = kwargs.pop("verbose", False)
         sheet_name = kwargs.pop("sheet_name", "Sheet1")
-        if run_once_within():
+        if run_once_within(reverse=True):
             use_pd("to_excel", verbose=verbose)
         if any(kwargs):
             format_excel(df=data, filename=fpath, **kwargs)
@@ -3497,12 +3573,8 @@ def figsave(*args, dpi=300):
         )
     else:
         plt.savefig(
-            fname, format=ftype.lower(), dpi=dpi, bbox_inches="tight", pad_inches=0
-        )
-        # elif ftype.lower() == "png":
-        #     plt.savefig(fname, format="png", dpi=dpi, bbox_inches="tight", transparent=True,pad_inches=0)
-        # elif ftype.lower() in ["tiff", "tif"]:
-        #     plt.savefig(fname, format="tiff", dpi=dpi, bbox_inches="tight",pad_inches=0)
+            fname, format=ftype.lower(), dpi=dpi, bbox_inches="tight", transparent=True,pad_inches=0
+        )
     elif ftype.lower() == "emf":
         plt.savefig(fname, format="emf", dpi=dpi, bbox_inches="tight", pad_inches=0)
     elif ftype.lower() == "fig":
@@ -5230,16 +5302,16 @@ def df_extend(data: pd.DataFrame, column, axis=0, sep=None, prefix="col"):
     data = data.explode(column, ignore_index=True)
     return data
 
-def
+def df_cycle(data: pd.DataFrame, columns=None, max_val=None, inplace=False):
     """
     Purpose: transforms a datetime feature (like month or day) into a cyclic encoding for use in machine learning models, particularly neural networks.
     Usage:
         data = pd.DataFrame({'month': [1, 4, 7, 10, 12]}) # Just months as an example
-        #
-        data =
+        # df_cycle month cyclically
+        data = df_cycle(data, 'month', 12)
     """
     if columns is None:
-        columns = list(data.columns) # If no columns specified, use all columns
+        columns = list(data.select_dtypes(include=np.number).columns) # If no columns specified, use all columns
     if max_val is None:
         max_val = np.max(data[columns]) # If no max_val specified, use the maximum value across all columns
     if isinstance(columns, str):
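Cyclic encoding usually maps a value x with period max_val to the pair sin(2πx/max_val) and cos(2πx/max_val), so month 12 ends up adjacent to month 1. A standalone sketch of that idea (the column names below are illustrative and may not match what `df_cycle` itself creates):

import numpy as np
import pandas as pd

data = pd.DataFrame({"month": [1, 4, 7, 10, 12]})
max_val = 12

# place each month on a circle so December and January are neighbours
data["month_sin"] = np.sin(2 * np.pi * data["month"] / max_val)
data["month_cos"] = np.cos(2 * np.pi * data["month"] / max_val)
print(data)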
@@ -5424,7 +5496,7 @@ def df_astype(
             # print(f"Successfully converted '{column}' to timedelta.")
         elif astype == "circular":
             max_val = kwargs.get('max_val',None)
-            data[column]=
+            data[column]=df_cycle(data=data,columns=column,max_val=max_val)
         else:
             # Convert to other types (e.g., float, int)
             if astype=='int':
@@ -5910,11 +5982,16 @@ def df_encoder(
 
 def df_scaler(
     data: pd.DataFrame,  # should be numeric dtype
+    scaler=None,
     method="standard",
     columns=None,  # default, select all numeric col/row
+    feature_range=None,# specific for 'minmax'
+    vmin=0,
+    vmax=1,
     inplace=False,
     verbose=False,  # show usage
     axis=0,  # defalut column-wise
+    return_scaler:bool=False,# True: return both: return df, scaler
     **kwargs,
 ):
     """
@@ -5932,31 +6009,51 @@ def df_scaler(
     """
     if verbose:
         print('df_scaler(data, scaler="standard", inplace=False, axis=0, verbose=True)')
-[18 removed lines; their content is not preserved in this extract]
+    if scaler is None:
+        methods = ["standard", "minmax", "robust","maxabs"]
+        method = strcmp(method, methods)[0]
+        if method == "standard":
+            from sklearn.preprocessing import StandardScaler
+            if verbose:
+                print("performs z-score normalization: This will standardize each feature to have a mean of 0 and a standard deviation of 1.")
+                print("Use when the data is approximately normally distributed (Gaussian).\nWorks well with algorithms sensitive to feature distribution, such as SVMs, linear regression, logistic regression, and neural networks.")
+            scaler = StandardScaler(**kwargs)
+        elif method == "minmax":
+            from sklearn.preprocessing import MinMaxScaler
+            if feature_range is None:
+                feature_range=(vmin,vmax)
+            if verbose:
+                print("don't forget to define the range: e.g., 'feature_range=(0, 1)'. ")
+                print("scales the features to the range [0, 1]. Adjust feature_range if you want a different range, like [-1, 1].")
+                print("Use when the data does not follow a normal distribution and you need all features in a specific range (e.g., [0, 1]).\nIdeal for algorithms that do not assume a particular distribution, such as k-nearest neighbors and neural networks.")
+            scaler = MinMaxScaler(feature_range=feature_range,**kwargs)
+        elif method == "robust":
+            from sklearn.preprocessing import RobustScaler
+            if verbose:
+                print("scales the data based on the median and interquartile range, which is robust to outliers.")
+                print("Use when the dataset contains outliers.\nThis method is useful because it scales based on the median and the interquartile range (IQR), which are more robust to outliers than the mean and standard deviation.")
+            scaler = RobustScaler(**kwargs)
+        elif method=="maxabs":
+            from sklearn.preprocessing import MaxAbsScaler
+            if verbose:
+                print("This scales each feature by its maximum absolute value, resulting in values within the range [-1, 1] for each feature.")
+                print("Use for data that is already sparse or when features have positive or negative values that need scaling without shifting the data.\nOften used with sparse data (data with many zeros), where preserving zero entries is essential, such as in text data or recommendation systems.")
+            scaler = MaxAbsScaler(**kwargs)
+    if axis not in [0, 1]:
+        raise ValueError("Axis must be 0 (column-wise) or 1 (row-wise).")
+    if verbose:
+        print(scaler)
     if axis == 0:
         # Column-wise scaling (default)
         if columns is None:
             columns = data.select_dtypes(include=np.number).columns.tolist()
         non_numeric_columns = data.columns.difference(columns)
 
-        scaled_data = scaler.fit_transform(data[columns])
+        # scaled_data = scaler.fit_transform(data[columns])
+        if scaler is None or not hasattr(scaler, 'mean_'):
+            scaled_data = scaler.fit_transform(data[columns])
+        else:
+            scaled_data = scaler.transform(data[columns])
 
         if inplace:
             data[columns] = scaled_data
@@ -5970,7 +6067,10 @@ def df_scaler(
                 axis=1,
             )
             scaled_df = scaled_df[data.columns]  # Maintain column order
-
+            if return_scaler:
+                return scaled_df,scaler
+            else:
+                return scaled_df
 
     elif axis == 1:
         # Row-wise scaling
@@ -5982,9 +6082,10 @@ def df_scaler(
 
         print(f"Scaling rows")
 
-        scaled_data = scaler.fit_transform(
-            numeric_rows.T
-        ).T  # Transpose for scaling and then back
+        # scaled_data = scaler.fit_transform(
+        #     numeric_rows.T
+        # ).T  # Transpose for scaling and then back
+        scaled_data = scaler.fit_transform(numeric_rows.T).T if scaler is None or not hasattr(scaler, 'mean_') else scaler.transform(numeric_rows.T).T
 
         if inplace:
             data.loc[numeric_rows.index] = scaled_data
@@ -5992,7 +6093,10 @@ def df_scaler(
         else:
             scaled_df = data.copy()
             scaled_df.loc[numeric_rows.index] = scaled_data
-
+            if return_scaler:
+                return scaled_df,scaler
+            else:
+                return scaled_df
 
 
 def df_special_characters_cleaner(
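The new `scaler=` and `return_scaler=` arguments support the usual fit-on-train / transform-on-test workflow: fit once, keep the fitted scaler, then pass it back in so the test set is scaled with the training statistics (the code above only calls `fit_transform` when the scaler is missing or not yet fitted). A hedged sketch, assuming the signature shown in this diff:

import pandas as pd
from py2ls.ips import df_scaler   # assumes py2ls >= 0.2.4.26

train = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [10.0, 20.0, 30.0]})
test  = pd.DataFrame({"a": [4.0, 5.0],      "b": [40.0, 50.0]})

# fit on the training data and get the fitted scaler back
train_scaled, scaler = df_scaler(train, method="standard", return_scaler=True)

# reuse the fitted scaler: df_scaler sees hasattr(scaler, "mean_") and only calls transform()
test_scaled = df_scaler(test, scaler=scaler)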
@@ -6010,15 +6114,20 @@ def df_special_characters_cleaner(
 
     # 1. Clean column names by replacing special characters with underscores
     if "column" in where_:
-
+        try:
+            data.columns = data.columns.str.replace(r"[^\w\s]", "_", regex=True)
+        except Exception as e:
+            print(e)
 
     # 2. Clean only object-type columns (text columns)
-[6 removed lines; their content is not preserved in this extract]
+    try:
+        if "content" in where_:
+            for col in data.select_dtypes(include=["object"]).columns:
+                data[col] = data[col].str.replace(r"[^\w\s]", "", regex=True)
+        if data.index.dtype == "object" and index in where_:
+            data.index = data.index.str.replace(r"[^\w\s]", "_", regex=True)
+    except:
+        pass
     return data
 
 
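The cleaning itself is a plain pandas `.str.replace` with the `[^\w\s]` character class: in column names every character that is neither a word character nor whitespace becomes an underscore, and in object-typed cells it is dropped. A small illustration outside the function:

import pandas as pd

df = pd.DataFrame({"price (€)": ["1,2", "3;4"], "ok_col": ["a", "b"]})

df.columns = df.columns.str.replace(r"[^\w\s]", "_", regex=True)   # -> ['price ___', 'ok_col']
for col in df.select_dtypes(include=["object"]).columns:
    df[col] = df[col].str.replace(r"[^\w\s]", "", regex=True)      # "1,2" -> "12", "3;4" -> "34"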
@@ -6401,6 +6510,9 @@ def df_reducer(
         # "autoencoder","nmf",
     ]
     method = strcmp(method, methods)[0]
+    if run_once_within(reverse=True):
+        print(f"support methods:{methods}")
+
     if verbose:
         print(f"\nprocessing with using {dict_methods[method]}:")
     xlabel, ylabel = None, None
@@ -6408,16 +6520,20 @@ def df_reducer(
         columns = data.select_dtypes(include="number").columns.tolist()
     if hue is None:
         hue = data.select_dtypes(exclude="number").columns.tolist()
+        print(f"auto select the non-number as 'hue':{hue}")
     if isinstance(hue, list):
         print("Warning: hue is a list, only select the 1st one")
         hue = hue[0]
-    if not hue:
+    if not any(hue):
         # Select columns if specified, else use all columns
         X = data[columns].values if columns else data.values
     else:
         # Select columns to reduce and hue for LDA
-
-
+        try:
+            X = data[columns].values if columns else data.drop(columns=[hue]).values
+            y = data[hue].values
+        except:
+            pass
     print(X.shape)
     # Handle missing values
     if fill_missing:
@@ -6884,33 +7000,49 @@ def df_reducer(
         colname_met = "SVD_"
     # Quick plots
     if plot_ and (not method in ["isolation_forest"]):
-        from .plot import plotxy
-        if ax is None:
-            if figsize is None:
-                _, ax = plt.subplots(figsize=cm2inch(8, 8))
-            else:
-                _, ax = plt.subplots(figsize=figsize)
-        else:
-            ax = ax.cla()
+        from .plot import plotxy,figsets,get_color
+        # if ax is None:
+        #     if figsize is None:
+        #         _, ax = plt.subplots(figsize=cm2inch(8, 8))
+        #     else:
+        #         _, ax = plt.subplots(figsize=figsize)
+        # else:
+        #     ax = ax.cla()
         xlabel = f"{colname_met}1" if xlabel is None else xlabel
         ylabel = f"{colname_met}2" if ylabel is None else ylabel
+        palette=get_color(len(flatten(data[hue],verbose=0)))
+
+        reduced_df=reduced_df.sort_values(by=hue)
+        print(flatten(reduced_df[hue]))
         ax = plotxy(
             data=reduced_df,
             x=colname_met + "1",
             y=colname_met + "2",
             hue=hue,
-
+            palette=palette,
+            # size=size,
             edgecolor=edgecolor,
-            kind_="
-[6 removed lines; their content is not preserved in this extract]
+            kind_=["joint",
+                   # "kde",
+                   "ell",
+                   ],
+            kws_kde=dict(
+                hue=hue,
+                levels=2,
+                common_norm=False,
+                fill=True,
+                alpha=0.05,
+            ),
+            kws_joint=dict(kind='scatter',joint_kws=dict(s=size)),
+            kws_ellipse=dict(alpha=0.1,lw=1,label=None),
             verbose=False,
             **kwargs,
         )
+        figsets(
+            legend=dict(loc=legend_loc, markerscale=markerscale,bbox_to_anchor=bbox_to_anchor,ncols=ncols,fontsize=8),
+            xlabel=xlabel if xlabel else None,
+            ylabel=ylabel if ylabel else None,
+        )
 
     if inplace:
         # If inplace=True, add components back into the original data
@@ -7387,6 +7519,7 @@ def df_qc(
     from statsmodels.stats.outliers_influence import variance_inflation_factor
     from scipy.stats import skew, kurtosis, entropy
 
+    pd.options.display.max_seq_items = 10
     #! display(data.select_dtypes(include=[np.number]).describe())
     #!skim
     if columns is not None:
@@ -7403,16 +7536,18 @@ def df_qc(
         data = data.copy()
         data.loc[:, data.isna().all()] = 0
     res_qc = {}
-    print(f"data.shape:{data.shape}")
+    print(f"⤵ data.shape:{data.shape}\n⤵ data.sample(10):")
+    display(data.sample(10).style.background_gradient(cmap="coolwarm", axis=1))
 
     # Missing values
     res_qc["missing_values"] = data.isnull().sum()
-    res_qc["missing_percentage"] = (res_qc["missing_values"] / len(data)) * 100
+    res_qc["missing_percentage"] = round((res_qc["missing_values"] / len(data)) * 100,2)
     res_qc["rows_with_missing"] = data.isnull().any(axis=1).sum()
 
     # Data types and unique values
     res_qc["data_types"] = data.dtypes
-    res_qc["
+    res_qc["unique_counts"] = data.select_dtypes(exclude=np.number).nunique().sort_values()
+    res_qc["unique_values"] = data.select_dtypes(exclude=np.number).apply(lambda x: x.unique())
     res_qc["constant_columns"] = [
         col for col in data.columns if data[col].nunique() <= 1
     ]
@@ -7428,33 +7563,42 @@ def df_qc(
     data_outliers = df_outlier(data)
     outlier_num = data_outliers.isna().sum() - data.isnull().sum()
     res_qc["outlier_num"] = outlier_num[outlier_num > 0]
-    outlier_percentage=(outlier_num / len(data_outliers)) * 100
+    outlier_percentage=round((outlier_num / len(data_outliers)) * 100,2)
     res_qc["outlier_percentage"] = outlier_percentage[outlier_percentage>0]
-[10 removed lines; their content is not preserved in this extract]
-    res_qc["high_correlations"] = high_corr_pairs
-
-    # VIF for multicollinearity check
-    numeric_df = data.select_dtypes(include=[np.number]).dropna()
-    vif_data = pd.DataFrame()
-    res_qc["vif"]=vif_data
-    if numeric_df.shape[1] > 1 and not numeric_df.empty:
-        vif_data["feature"] = numeric_df.columns
-        vif_data["VIF"] = [
-            variance_inflation_factor(numeric_df.values, i)
-            for i in range(numeric_df.shape[1])
+    try:
+        # Correlation and multicollinearity (VIF)
+        if any(data.dtypes.apply(pd.api.types.is_numeric_dtype)):
+            numeric_df = data.select_dtypes(include=[np.number]).dropna()
+            corr_matrix = numeric_df.corr()
+            high_corr_pairs = [
+                (col1, col2)
+                for col1 in corr_matrix.columns
+                for col2 in corr_matrix.columns
+                if col1 != col2 and abs(corr_matrix[col1][col2]) > 0.9
             ]
-    res_qc["
-
-
+            res_qc["high_correlations"] = high_corr_pairs
+
+            # VIF for multicollinearity check
+            numeric_df = data.select_dtypes(include=[np.number]).dropna()
+            if isinstance(numeric_df.columns, pd.MultiIndex):
+                numeric_df.columns = [
+                    "_".join(col).strip() if isinstance(col, tuple) else col for col in numeric_df.columns
+                ]
+
+
+            vif_data = pd.DataFrame()
+            res_qc["vif"]=vif_data
+            if numeric_df.shape[1] > 1 and not numeric_df.empty:
+                vif_data["feature"] = numeric_df.columns.tolist()
+                vif_data["VIF"] = [
+                    round(variance_inflation_factor(numeric_df.values, i),2)
+                    for i in range(numeric_df.shape[1])
+                ]
+                res_qc["vif"] = vif_data[
+                    vif_data["VIF"] > 5
+                ]  # Typically VIF > 5 indicates multicollinearity
+    except Exception as e:
+        print(e)
     # Skewness and Kurtosis
     skewness = data.skew(numeric_only=True)
     kurtosis_vals = data.kurt(numeric_only=True)
@@ -7467,8 +7611,7 @@ def df_qc(
         col: entropy(data[col].value_counts(normalize=True), base=2)
         for col in categorical_cols
     }
-
-    res_qc["unique_counts"] = data.nunique()
+
     # dtypes counts
     res_qc['dtype_counts']=data.dtypes.value_counts()
 
@@ -7515,7 +7658,7 @@ def df_qc(
     res_qc["text_length_analysis"] = text_lengths
 
     # Summary statistics
-    res_qc["summary_statistics"] = data.describe().T
+    res_qc["summary_statistics"] = data.describe().T.style.background_gradient(cmap='coolwarm', axis=0)
 
     # Automated warnings
     warnings = []
@@ -7537,28 +7680,45 @@ def df_qc(
 
     # Report generation
     if verbose:
-        print("=== QC Report Summary ===")
         print("\n⤵ Summary Statistics:")
         display(res_qc["summary_statistics"])
         print("\n⤵ Data Types:")
         display(res_qc["data_types"])
         if any(res_qc["missing_values"][res_qc["missing_values"] > 0]):
             print(" ⤵ Missing Values Counts:")
-            display(
+            display(pd.DataFrame(
+                {
+                    "missing_values": res_qc["missing_values"][res_qc["missing_values"] > 0],
+                    "missing_percent(%)": res_qc["missing_percentage"][
+                        res_qc["missing_percentage"] > 0
+                    ],
+                }
+            ).style.background_gradient(cmap="coolwarm", axis=0)
+            )
         # print(res_qc["missing_percentage"][res_qc["missing_percentage"] > 0])
         print("\n⤵ Rows with Missing Values:",res_qc["rows_with_missing"])
 
+        print("\n⤵ Constant Columns:", res_qc["constant_columns"]) if any(res_qc["constant_columns"]) else None
+        print("⤵ Duplicate Rows:", res_qc["duplicate_rows"]) if res_qc["duplicate_rows"] else None
+        print("⤵ Duplicate Columns:", res_qc["duplicate_columns"]) if any(res_qc["duplicate_columns"]) else None
+
        if any(res_qc["outlier_num"]):
            print("\n⤵ Outlier Report:")
-            display(
-
-
-
+            display(pd.DataFrame(
+                {
+                    "outlier_num": res_qc["outlier_num"][res_qc["outlier_num"] > 0],
+                    "outlier_percentage(%)": res_qc["outlier_percentage"][
+                        res_qc["outlier_percentage"] > 0
+                    ],
+                }
+            ).style.background_gradient(cmap="coolwarm", axis=0)
+            )
 
-
+        if any(res_qc["unique_counts"]):
+            print("\n⤵ Unique Values per Column:")
+            display(pd.DataFrame({"unique_counts":res_qc["unique_counts"],
+                                  "unique_values":res_qc["unique_values"]}).style.background_gradient(cmap="coolwarm", axis=0))
 
-        print("⤵ Duplicate Rows:", res_qc["duplicate_rows"]) if res_qc["duplicate_rows"] else None
-        print("⤵ Duplicate Columns:", res_qc["duplicate_columns"]) if any(res_qc["duplicate_columns"]) else None
 
        if res_qc["empty_columns"]:
            print("\n⤵ Empty Columns:", res_qc["empty_columns"])
@@ -7570,7 +7730,7 @@ def df_qc(
 
         if "vif" in res_qc:
             print("\n⤵ Features with High VIF (>|5|):")
-
+            display(res_qc["vif"].style.background_gradient(cmap="coolwarm", axis=0))
 
         if any(res_qc["high_cardinality_categoricals"]):
             print("\n⤵ High Cardinality Categorical Columns (>|50 unique|):")
@@ -7589,28 +7749,27 @@ def df_qc(
             print("\nWarnings:")
             for warning in res_qc["warnings"]:
                 print(" -", warning)
+
+    pd.reset_option("display.max_seq_items")
     if plot_:
-        df_qc_plots(data=data, res_qc=res_qc, max_cols=max_cols,hue=hue)
-
-        try:
-            figsave(dir_save)
-        except Exception as e:
-            print(f"⚠️: {e}")
-    if output:
+        df_qc_plots(data=data, res_qc=res_qc, max_cols=max_cols,hue=hue,dir_save=dir_save)
+    if output or not plot_:
         return res_qc
     return None
 
 
-def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20,hue=None):
+def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20,hue=None,dir_save=None):
     import matplotlib.pyplot as plt
     import seaborn as sns
     from .plot import subplot, figsets, get_color
+    from datetime import datetime
+    now_ = datetime.now().strftime("%y%m%d_%H%M%S")
 
     if columns is not None:
         if isinstance(columns, (list,pd.core.indexes.base.Index)):
             data=data[columns]
     len_total = len(res_qc)
-    n_row, n_col = int((len_total + 10) / 3), 3
+    n_row, n_col = int((len_total + 10)), 3
     nexttile = subplot(n_row, n_col, figsize=[5 * n_col, 5 * n_row],verbose=False)
 
     missing_data = res_qc["missing_values"][res_qc["missing_values"] > 0].sort_values(
@@ -7638,15 +7797,7 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20,
         ax=nexttile(),
     )
     figsets(ax=ax_outlier_num,title="Outliers (#)", xlabel="#",ylabel=None,fontsize=8 if len(outlier_num)<=20 else 6)
-
-    #!
-    try:
-        if data.select_dtypes(include=np.number).shape[1]<=10:
-            for col in data.select_dtypes(include=np.number).columns:
-                sns.histplot(data[col], kde=True, bins=30, ax=nexttile())
-                figsets(title=f"Distribution: {col}", xlabel=col, ylabel="Frequency")
-    except:
-        pass
+
     #!
     try:
         for col in data.select_dtypes(include='category').columns:
@@ -7775,8 +7926,10 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20,
             title="Dtypes",
             ylabel="#",
             ax=ax_dtype_counts,
-            fontsize=8
+            fontsize=8 if len(dtype_counts.index)<=20 else 6,
         )
+        # from .plot import pie
+        # pie()
 
     # High cardinality: Show top categorical columns by unique value count
     high_cardinality = res_qc["high_cardinality_categoricals"]
@@ -7857,6 +8010,79 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20,
             title="Correlation Heatmap",
             ax=ax_heatmap
         )
+    # # save figure
+    # if dir_save:
+    #     figsave(dir_save,f"qc_plot_{now_}.pdf")
+
+    if columns is not None:
+        if isinstance(columns, (list,pd.core.indexes.base.Index)):
+            data=data[columns]
+
+    # len_total = len(res_qc)
+    # n_row, n_col = int((len_total + 10) / 3), 3
+    # nexttile = subplot(n_row, n_col, figsize=[5 * n_col, 5 * n_row],verbose=False)
+    #! check distribution
+    data_num = data.select_dtypes(include=np.number)
+    if len(data_num) > max_cols:
+        data_num = data_num.iloc[:,:max_cols]
+
+    data_num = df_scaler(data=data_num, method='standard')
+
+    import scipy.stats as stats
+    for column in data_num.columns:
+        #* Shapiro-Wilk test for normality
+        stat, p_value = stats.shapiro(data_num[column])
+        normality = "norm" if p_value > 0.05 else "not_norm"
+        #* Plot histogram
+        ax_hist=sns.histplot(data_num[column], kde=True, ax=nexttile())
+        x_min, x_max = ax_hist.get_xlim()
+        y_min, y_max = ax_hist.get_ylim()
+        ax_hist.text(x_min+(x_max-x_min)*0.5, y_min+(y_max-y_min)*0.75,
+                     f'p(Shapiro-Wilk)={p_value:.3f}\n{normality}',
+                     ha='center', va='top')
+        figsets(title=column,ax=ax_hist)
+        ax_twin=ax_hist.twinx()
+        #* Q-Q plot
+        stats.probplot(data_num[column], dist="norm", plot=ax_twin)
+        figsets(ylabel=f'Q-Q Plot:{column}',title=None)
+    # save figure
+    if dir_save:
+        figsave(dir_save,f"qc_plot_{now_}.pdf")
+
+def df_corr(df: pd.DataFrame, method="pearson"):
+    """
+    Compute correlation coefficients and p-values for a DataFrame.
+
+    Parameters:
+    - df (pd.DataFrame): Input DataFrame with numeric data.
+    - method (str): Correlation method ("pearson", "spearman", "kendall").
+
+    Returns:
+    - corr_matrix (pd.DataFrame): Correlation coefficient matrix.
+    - pval_matrix (pd.DataFrame): P-value matrix.
+    """
+    from scipy.stats import pearsonr, spearmanr, kendalltau
+
+    methods = ["pearson", "spearman", "kendall"]
+    method = strcmp(method, methods)[0]
+    methods_dict = {"pearson": pearsonr, "spearman": spearmanr, "kendall": kendalltau}
+
+    cols = df.columns
+    corr_matrix = pd.DataFrame(index=cols, columns=cols, dtype=float)
+    pval_matrix = pd.DataFrame(index=cols, columns=cols, dtype=float)
+    correlation_func = methods_dict[method]
+
+    for col1 in cols:
+        for col2 in cols:
+            if col1 == col2:
+                corr_matrix.loc[col1, col2] = 1.0
+                pval_matrix.loc[col1, col2] = 0.0
+            else:
+                corr, pval = correlation_func(df[col1], df[col2])
+                corr_matrix.loc[col1, col2] = corr
+                pval_matrix.loc[col1, col2] = pval
+
+    return corr_matrix, pval_matrix
 
 def use_pd(
     func_name="excel",
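The new `df_corr` helper above returns a coefficient matrix and a matching p-value matrix computed pairwise with scipy. A small usage sketch (the data and column names are illustrative):

import numpy as np
import pandas as pd
from py2ls.ips import df_corr   # assumes py2ls >= 0.2.4.26

rng = np.random.default_rng(0)
df = pd.DataFrame({"x": rng.normal(size=100)})
df["y"] = 0.8 * df["x"] + rng.normal(scale=0.5, size=100)
df["z"] = rng.normal(size=100)

corr, pval = df_corr(df, method="spearman")
print(corr.round(2))    # correlation coefficients
print(pval.round(4))    # matching p-values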
@@ -7877,3 +8103,135 @@ def use_pd(
     except Exception as e:
         if verbose:
             print(e)
+
+def get_phone(phone_number: str, region: str = None,verbose=True):
+    """
+    usage:
+        info = get_phone(15237654321, "DE")
+        preview(info)
+
+    Extremely advanced phone number analysis function.
+
+    Args:
+        phone_number (str): The phone number to analyze.
+        region (str): None (Default). Tries to work with international numbers including country codes; otherwise, uses the specified region.
+
+    Returns:
+        dict: Comprehensive information about the phone number.
+    """
+    import phonenumbers
+    from phonenumbers import geocoder, carrier, timezone, number_type
+    from datetime import datetime
+    import pytz
+    from tzlocal import get_localzone
+
+    if not isinstance(phone_number, str):
+        phone_number = str(phone_number)
+    if isinstance(region, str):
+        region = region.upper()
+
+    try:
+        # Parse the phone number
+        parsed_number = phonenumbers.parse(phone_number, region)
+
+        # Validate the phone number
+        valid = phonenumbers.is_valid_number(parsed_number)
+        possible = phonenumbers.is_possible_number(parsed_number)
+
+        if not valid:
+            suggested_fix = phonenumbers.example_number(region) if region else "Unknown"
+            return {
+                "valid": False,
+                "error": "Invalid phone number",
+                "suggested_fix": suggested_fix,
+            }
+
+        # Basic details
+        formatted_international = phonenumbers.format_number(
+            parsed_number, phonenumbers.PhoneNumberFormat.INTERNATIONAL
+        )
+        formatted_national = phonenumbers.format_number(
+            parsed_number, phonenumbers.PhoneNumberFormat.NATIONAL
+        )
+        formatted_e164 = phonenumbers.format_number(
+            parsed_number, phonenumbers.PhoneNumberFormat.E164
+        )
+        country_code = parsed_number.country_code
+        region_code = geocoder.region_code_for_number(parsed_number)
+        country_name = geocoder.country_name_for_number(parsed_number, "en")
+
+        location = geocoder.description_for_number(parsed_number, "en")
+        carrier_name = carrier.name_for_number(parsed_number, "en") or "Unknown Carrier"
+        time_zones = timezone.time_zones_for_number(parsed_number)[0]
+        current_times = datetime.now(pytz.timezone(time_zones)).strftime(
+            "%Y-%m-%d %H:%M:%S %Z"
+        )
+        number_type_str = {
+            phonenumbers.PhoneNumberType.FIXED_LINE: "Fixed Line",
+            phonenumbers.PhoneNumberType.MOBILE: "Mobile",
+            phonenumbers.PhoneNumberType.FIXED_LINE_OR_MOBILE: "Fixed Line or Mobile",
+            phonenumbers.PhoneNumberType.TOLL_FREE: "Toll Free",
+            phonenumbers.PhoneNumberType.PREMIUM_RATE: "Premium Rate",
+            phonenumbers.PhoneNumberType.SHARED_COST: "Shared Cost",
+            phonenumbers.PhoneNumberType.VOIP: "VOIP",
+            phonenumbers.PhoneNumberType.PERSONAL_NUMBER: "Personal Number",
+            phonenumbers.PhoneNumberType.PAGER: "Pager",
+            phonenumbers.PhoneNumberType.UAN: "UAN",
+            phonenumbers.PhoneNumberType.UNKNOWN: "Unknown",
+        }.get(number_type(parsed_number), "Unknown")
+
+        # Advanced Features
+        is_toll_free = (
+            number_type(parsed_number) == phonenumbers.PhoneNumberType.TOLL_FREE
+        )
+        is_premium_rate = (
+            number_type(parsed_number) == phonenumbers.PhoneNumberType.PREMIUM_RATE
+        )
+
+        # Dialing Information
+        dialing_instructions = f"Dial {formatted_national} within {country_name}. Dial {formatted_e164} from abroad."
+
+        # Advanced Timezone Handling
+        gmt_offsets = pytz.timezone(time_zones).utcoffset(datetime.now()).total_seconds()/ 3600
+        # Get the local timezone (current computer's time)
+        local_timezone = get_localzone()
+        #local_timezone = pytz.timezone(pytz.country_timezones[region_code][0])
+        local_offset = local_timezone.utcoffset(datetime.now()).total_seconds() / 3600
+        offset_diff = local_offset - gmt_offsets
+        head_time = "earlier" if offset_diff < 0 else "later" if offset_diff > 0 else ""
+        res= {
+            "valid": True,
+            "possible": possible,
+            "formatted": {
+                "international": formatted_international,
+                "national": formatted_national,
+                "e164": formatted_e164,
+            },
+            "country_code": country_code,
+            "country_name": country_name,
+            "region_code": region_code,
+            "location": location if location else "Unknown",
+            "carrier": carrier_name,
+            "time_zone": time_zones,
+            "current_times": current_times,
+            "local_offset":f"{local_offset} utcoffset",
+            "time_zone_diff": f"{head_time} {int(np.abs(offset_diff))} h",
+            "number_type": number_type_str,
+            "is_toll_free": is_toll_free,
+            "is_premium_rate": is_premium_rate,
+            "dialing_instructions": dialing_instructions,
+            "suggested_fix": None,  # Use phonenumbers.example_number if invalid
+            "logs": {
+                "number_analysis_completed": datetime.now().strftime(
+                    "%Y-%m-%d %H:%M:%S"
+                ),
+                "raw_input": phone_number,
+                "parsed_number": str(parsed_number),
+            },
+        }
+
+    except phonenumbers.NumberParseException as e:
+        res= {"valid": False, "error": str(e)}
+    if verbose:
+        preview(res)
+    return res