py2ls 0.2.4.24__py3-none-any.whl → 0.2.4.25__py3-none-any.whl
- py2ls/.git/index +0 -0
- py2ls/ec2ls.py +61 -0
- py2ls/ips.py +105 -55
- py2ls/ml2ls.py +244 -110
- py2ls/nl2ls.py +283 -0
- py2ls/plot.py +351 -40
- {py2ls-0.2.4.24.dist-info → py2ls-0.2.4.25.dist-info}/METADATA +1 -1
- {py2ls-0.2.4.24.dist-info → py2ls-0.2.4.25.dist-info}/RECORD +9 -8
- py2ls/ml2ls copy.py +0 -2906
- {py2ls-0.2.4.24.dist-info → py2ls-0.2.4.25.dist-info}/WHEEL +0 -0
py2ls/.git/index
CHANGED
Binary file
py2ls/ec2ls.py
ADDED
@@ -0,0 +1,61 @@
+def get_trend(
+    keywords: list = None,  # ["AI", "Python", "Data Science"]
+    timezone: str = "Europe/Berlin",  # minutes differ from UTC
+    cat=0,
+    timeframe="today 12-m",
+    geo="DE",
+    gprop="",
+    **kwargs
+):
+    from pytrends.request import TrendReq
+    from pytrends.exceptions import TooManyRequestsError
+    import pytz
+    from datetime import datetime
+    import time
+    import requests
+    from urllib3.util.retry import Retry
+
+    if isinstance(timezone, str):
+        stadt = pytz.timezone(timezone)
+        current_time = datetime.now(stadt)  # This will be timezone-aware
+        # Convert the timezone-aware datetime to naive UTC datetime
+        naive_time = current_time.astimezone(pytz.utc).replace(tzinfo=None)
+        tz_offset = stadt.utcoffset(naive_time).seconds // 60  # in minutes
+    elif isinstance(timezone, int):
+        tz_offset = timezone
+
+    # Initialize TrendReq with correct timezone offset
+    pytrends = TrendReq(hl="en-US", tz=tz_offset)
+
+    # Ensure that keywords are in list form
+    if isinstance(keywords, str):
+        keywords = [keywords]
+
+    pytrends.build_payload(keywords, cat=cat, timeframe=timeframe, geo=geo, gprop=gprop)
+
+    res = {}
+    # Try fetching data with error handling
+    for func_name, fetch_func in [
+        ("interest_over_time", pytrends.interest_over_time),
+        ("related_topics", pytrends.related_topics),
+        ("related_queries", pytrends.related_queries),
+        ("categories", pytrends.categories)
+    ]:
+        try:
+            print(f"Fetching {func_name}...")
+            res[func_name] = fetch_func()
+            print(f"done: {func_name}")
+        except TooManyRequestsError:
+            print(f"Too many requests error for {func_name}. Retrying...")
+            time.sleep(5)  # Delay to avoid spamming the server
+            if retries > 0:
+                return get_trend(keywords, timezone, cat, timeframe, geo, gprop, retries=retries-1)
+            res[func_name] = None
+        except requests.exceptions.RequestException as e:
+            print(f"Request error for {func_name}: {e}")
+            res[func_name] = None
+        except Exception as e:
+            print(f"Error fetching {func_name}: {e}")
+            res[func_name] = None
+
+    return res
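Note on the added get_trend: the TooManyRequestsError branch checks a retries variable that is never defined as a parameter, so the first rate-limit hit would raise a NameError. A minimal sketch of a self-contained retry helper under that assumption (fetch_with_retry, retries, and delay are illustrative names, not part of the package):

    import time
    from pytrends.exceptions import TooManyRequestsError

    def fetch_with_retry(fetch_func, retries: int = 3, delay: int = 5):
        # Call one pytrends fetcher (e.g. pytrends.interest_over_time),
        # retrying a bounded number of times when the API rate-limits us.
        for attempt in range(retries + 1):
            try:
                return fetch_func()
            except TooManyRequestsError:
                if attempt == retries:
                    return None  # give up after the last attempt
                time.sleep(delay * (attempt + 1))  # simple linear back-off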
py2ls/ips.py
CHANGED
@@ -18,15 +18,17 @@ warnings.simplefilter("ignore", category=pd.errors.SettingWithCopyWarning)
 warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning)
 
 
-def run_once_within(duration=60):  # default 60s
+def run_once_within(duration=60, reverse=False):  # default 60s
     import time
 
     """
+    If reverse is True, do not run on the first call; only run from the second call onward.
    usage:
    if run_once_within():
        print("This code runs once per minute.")
    else:
        print("The code has already been run in the last minute.")
+
    """
    if not hasattr(run_once_within, "time_last"):
        run_once_within.time_last = None
@@ -36,9 +38,9 @@ def run_once_within(duration=60):  # default 60s
        time_curr - run_once_within.time_last >= duration
    ):
        run_once_within.time_last = time_curr  # Update the last execution time
-        return True
+        return False if reverse else True
    else:
-        return False
+        return True if reverse else False
 
 
 def plt_font(dir_font: str = "/System/Library/Fonts/Hiragino Sans GB.ttc"):
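Usage sketch for the new reverse flag (behaviour read directly from the diff above):

    # reverse=False (default): True on the first call in a window, False on repeats.
    # reverse=True: False on the first call in a window, True on repeats.
    if run_once_within(duration=60, reverse=True):
        print("already ran within the last minute")
    else:
        print("first call in this window")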
@@ -3497,12 +3499,8 @@ def figsave(*args, dpi=300):
         )
     else:
         plt.savefig(
-            fname, format=ftype.lower(), dpi=dpi, bbox_inches="tight", pad_inches=0
-        )
-        # elif ftype.lower() == "png":
-        #     plt.savefig(fname, format="png", dpi=dpi, bbox_inches="tight", transparent=True,pad_inches=0)
-        # elif ftype.lower() in ["tiff", "tif"]:
-        #     plt.savefig(fname, format="tiff", dpi=dpi, bbox_inches="tight",pad_inches=0)
+            fname, format=ftype.lower(), dpi=dpi, bbox_inches="tight", transparent=True,pad_inches=0
+        )
     elif ftype.lower() == "emf":
         plt.savefig(fname, format="emf", dpi=dpi, bbox_inches="tight", pad_inches=0)
     elif ftype.lower() == "fig":
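The fallback branch above now saves every remaining format with a transparent background. A minimal matplotlib illustration of the same call (figsave itself is the py2ls wrapper; the savefig keywords are standard matplotlib):

    import matplotlib.pyplot as plt

    fig, ax = plt.subplots()
    ax.plot([0, 1], [0, 1])
    # Mirrors the new fallback: tight bounding box, no padding, transparent background.
    plt.savefig("example.pdf", format="pdf", dpi=300,
                bbox_inches="tight", transparent=True, pad_inches=0)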
@@ -5230,16 +5228,16 @@ def df_extend(data: pd.DataFrame, column, axis=0, sep=None, prefix="col"):
     data = data.explode(column, ignore_index=True)
     return data
 
-def
+def df_cycle(data: pd.DataFrame, columns=None, max_val=None, inplace=False):
     """
     Purpose: transforms a datetime feature (like month or day) into a cyclic encoding for use in machine learning models, particularly neural networks.
     Usage:
         data = pd.DataFrame({'month': [1, 4, 7, 10, 12]})  # Just months as an example
-        #
-        data =
+        # df_cycle month cyclically
+        data = df_cycle(data, 'month', 12)
     """
     if columns is None:
-        columns = list(data.columns)  # If no columns specified, use all columns
+        columns = list(data.select_dtypes(include=np.number).columns)  # If no columns specified, use all columns
     if max_val is None:
         max_val = np.max(data[columns])  # If no max_val specified, use the maximum value across all columns
     if isinstance(columns, str):
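The body of df_cycle is not shown in this hunk; the cyclic encoding its docstring describes is usually a sine/cosine transform so that, e.g., month 12 ends up next to month 1. A standalone sketch of that transform (the month_sin/month_cos column names are illustrative, not necessarily what df_cycle emits):

    import numpy as np
    import pandas as pd

    def cyclic_encode(data: pd.DataFrame, column: str, max_val: int) -> pd.DataFrame:
        # Map a bounded feature (e.g. month in 1..12) onto the unit circle so a
        # model treats December (12) and January (1) as neighbours.
        out = data.copy()
        out[f"{column}_sin"] = np.sin(2 * np.pi * out[column] / max_val)
        out[f"{column}_cos"] = np.cos(2 * np.pi * out[column] / max_val)
        return out

    data = pd.DataFrame({"month": [1, 4, 7, 10, 12]})
    print(cyclic_encode(data, "month", 12))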
@@ -5424,7 +5422,7 @@ def df_astype(
             # print(f"Successfully converted '{column}' to timedelta.")
         elif astype == "circular":
             max_val = kwargs.get('max_val',None)
-            data[column]=
+            data[column]=df_cycle(data=data,columns=column,max_val=max_val)
         else:
             # Convert to other types (e.g., float, int)
             if astype=='int':
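Hedged usage of the new "circular" branch (the full df_astype signature is not shown in this diff, so the keyword names below are assumptions):

    import pandas as pd

    data = pd.DataFrame({"month": [1, 4, 7, 10, 12]})
    # Assumed call pattern: encode 'month' circularly with period 12 via df_cycle.
    data = df_astype(data, column="month", astype="circular", max_val=12)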
@@ -5910,11 +5908,13 @@ def df_encoder(
 
 def df_scaler(
     data: pd.DataFrame,  # should be numeric dtype
+    scaler=None,
     method="standard",
     columns=None,  # default, select all numeric col/row
     inplace=False,
     verbose=False,  # show usage
     axis=0,  # defalut column-wise
+    return_scaler:bool=False,# True: return both: return df, scaler
     **kwargs,
 ):
     """
@@ -5932,31 +5932,49 @@ def df_scaler(
     """
     if verbose:
         print('df_scaler(data, scaler="standard", inplace=False, axis=0, verbose=True)')
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    if scaler is None:
+        methods = ["standard", "minmax", "robust","maxabs"]
+        method = strcmp(method, methods)[0]
+        if method == "standard":
+            from sklearn.preprocessing import StandardScaler
+            if verbose:
+                print("performs z-score normalization: This will standardize each feature to have a mean of 0 and a standard deviation of 1.")
+                print("Use when the data is approximately normally distributed (Gaussian).\nWorks well with algorithms sensitive to feature distribution, such as SVMs, linear regression, logistic regression, and neural networks.")
+            scaler = StandardScaler(**kwargs)
+        elif method == "minmax":
+            from sklearn.preprocessing import MinMaxScaler
+            if verbose:
+                print("don't forget to define the range: e.g., 'feature_range=(0, 1)'. ")
+                print("scales the features to the range [0, 1]. Adjust feature_range if you want a different range, like [-1, 1].")
+                print("Use when the data does not follow a normal distribution and you need all features in a specific range (e.g., [0, 1]).\nIdeal for algorithms that do not assume a particular distribution, such as k-nearest neighbors and neural networks.")
+            scaler = MinMaxScaler(**kwargs)
+        elif method == "robust":
+            from sklearn.preprocessing import RobustScaler
+            if verbose:
+                print("scales the data based on the median and interquartile range, which is robust to outliers.")
+                print("Use when the dataset contains outliers.\nThis method is useful because it scales based on the median and the interquartile range (IQR), which are more robust to outliers than the mean and standard deviation.")
+            scaler = RobustScaler(**kwargs)
+        elif method=="maxabs":
+            from sklearn.preprocessing import MaxAbsScaler
+            if verbose:
+                print("This scales each feature by its maximum absolute value, resulting in values within the range [-1, 1] for each feature.")
+                print("Use for data that is already sparse or when features have positive or negative values that need scaling without shifting the data.\nOften used with sparse data (data with many zeros), where preserving zero entries is essential, such as in text data or recommendation systems.")
+            scaler = MaxAbsScaler(**kwargs)
+    if axis not in [0, 1]:
+        raise ValueError("Axis must be 0 (column-wise) or 1 (row-wise).")
+    if verbose:
+        print(scaler)
     if axis == 0:
         # Column-wise scaling (default)
         if columns is None:
             columns = data.select_dtypes(include=np.number).columns.tolist()
         non_numeric_columns = data.columns.difference(columns)
 
-        scaled_data = scaler.fit_transform(data[columns])
+        # scaled_data = scaler.fit_transform(data[columns])
+        if scaler is None or not hasattr(scaler, 'mean_'):
+            scaled_data = scaler.fit_transform(data[columns])
+        else:
+            scaled_data = scaler.transform(data[columns])
 
         if inplace:
             data[columns] = scaled_data
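strcmp here is py2ls's own fuzzy matcher; judging from its use, strcmp(method, methods)[0] returns the closest entry of methods. A rough standard-library stand-in under that assumption (fuzzy_pick is a hypothetical name):

    import difflib

    def fuzzy_pick(query: str, options: list) -> str:
        # Return the option closest to the query, falling back to the first option.
        matches = difflib.get_close_matches(query.lower(), [o.lower() for o in options],
                                            n=1, cutoff=0.0)
        return matches[0] if matches else options[0]

    print(fuzzy_pick("standar", ["standard", "minmax", "robust", "maxabs"]))  # -> "standard"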
@@ -5970,7 +5988,10 @@ def df_scaler(
             axis=1,
         )
         scaled_df = scaled_df[data.columns]  # Maintain column order
-
+        if return_scaler:
+            return scaled_df,scaler
+        else:
+            return scaled_df
 
     elif axis == 1:
         # Row-wise scaling
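Usage sketch for the new scaler/return_scaler pair: fit once on training data, then pass the fitted scaler back in so the same transform is applied to new data (column names are illustrative):

    import pandas as pd

    train = pd.DataFrame({"height": [160, 170, 180], "weight": [55, 70, 85]})
    test = pd.DataFrame({"height": [165, 175], "weight": [60, 80]})

    # Fit on the training frame and keep the fitted scaler.
    train_scaled, fitted = df_scaler(train, method="standard", return_scaler=True)

    # Re-use the fitted scaler on new data instead of re-fitting it.
    test_scaled = df_scaler(test, scaler=fitted)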
@@ -5982,9 +6003,10 @@ def df_scaler(
 
             print(f"Scaling rows")
 
-        scaled_data = scaler.fit_transform(
-            numeric_rows.T
-        ).T  # Transpose for scaling and then back
+        # scaled_data = scaler.fit_transform(
+        #     numeric_rows.T
+        # ).T # Transpose for scaling and then back
+        scaled_data = scaler.fit_transform(numeric_rows.T).T if scaler is None or not hasattr(scaler, 'mean_') else scaler.transform(numeric_rows.T).T
 
         if inplace:
             data.loc[numeric_rows.index] = scaled_data
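One caveat with the fitted-state check above: hasattr(scaler, 'mean_') only recognises a fitted StandardScaler; fitted MinMaxScaler, RobustScaler, or MaxAbsScaler instances expose scale_ (not mean_), so a pre-fitted one would be silently re-fitted. A scaler-agnostic alternative using scikit-learn's own utility, offered as a suggestion rather than what the package does:

    from sklearn.exceptions import NotFittedError
    from sklearn.utils.validation import check_is_fitted

    def is_fitted(scaler) -> bool:
        # True if the estimator has been fitted, whatever scaler class it is.
        try:
            check_is_fitted(scaler)
            return True
        except NotFittedError:
            return False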
@@ -5992,7 +6014,10 @@ def df_scaler(
         else:
             scaled_df = data.copy()
             scaled_df.loc[numeric_rows.index] = scaled_data
-
+            if return_scaler:
+                return scaled_df,scaler
+            else:
+                return scaled_df
 
 
 def df_special_characters_cleaner(
@@ -7590,21 +7615,18 @@ def df_qc(
         for warning in res_qc["warnings"]:
             print(" -", warning)
     if plot_:
-        df_qc_plots(data=data, res_qc=res_qc, max_cols=max_cols,hue=hue)
-
-        try:
-            figsave(dir_save)
-        except Exception as e:
-            print(f"⚠️: {e}")
-    if output:
+        df_qc_plots(data=data, res_qc=res_qc, max_cols=max_cols,hue=hue,dir_save=dir_save)
+    if output or not plot_:
         return res_qc
     return None
 
 
-def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20,hue=None):
+def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20,hue=None,dir_save=None):
     import matplotlib.pyplot as plt
     import seaborn as sns
     from .plot import subplot, figsets, get_color
+    from datetime import datetime
+    now_ = datetime.now().strftime("%y%m%d_%H%M%S")
 
     if columns is not None:
         if isinstance(columns, (list,pd.core.indexes.base.Index)):
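Hedged usage of the revised df_qc flow (only the parameters visible in this hunk are used; the rest of the signature is not shown in the diff):

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2, None, 4], "b": ["x", "y", "y", None]})
    # With plot_=True the QC plots are drawn and written under dir_save;
    # the result dict comes back when output=True (or when plotting is off).
    res = df_qc(df, plot_=True, output=True, dir_save="./qc_reports/")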
@@ -7638,15 +7660,7 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20,
         ax=nexttile(),
     )
     figsets(ax=ax_outlier_num,title="Outliers (#)", xlabel="#",ylabel=None,fontsize=8 if len(outlier_num)<=20 else 6)
-
-    #!
-    try:
-        if data.select_dtypes(include=np.number).shape[1]<=10:
-            for col in data.select_dtypes(include=np.number).columns:
-                sns.histplot(data[col], kde=True, bins=30, ax=nexttile())
-                figsets(title=f"Distribution: {col}", xlabel=col, ylabel="Frequency")
-    except:
-        pass
+
     #!
     try:
         for col in data.select_dtypes(include='category').columns:
|
|
7857
7871
|
title="Correlation Heatmap",
|
7858
7872
|
ax=ax_heatmap
|
7859
7873
|
)
|
7874
|
+
# save figure
|
7875
|
+
if dir_save:
|
7876
|
+
figsave(dir_save,f"qc_plot_{now_}.pdf")
|
7860
7877
|
|
7878
|
+
if columns is not None:
|
7879
|
+
if isinstance(columns, (list,pd.core.indexes.base.Index)):
|
7880
|
+
data=data[columns]
|
7881
|
+
len_total = len(res_qc)
|
7882
|
+
n_row, n_col = int((len_total + 10) / 3), 3
|
7883
|
+
nexttile = subplot(n_row, n_col, figsize=[5 * n_col, 5 * n_row],verbose=False)
|
7884
|
+
#! check distribution
|
7885
|
+
data_num = data.select_dtypes(include=np.number)
|
7886
|
+
if len(data_num) > max_cols:
|
7887
|
+
data_num = data_num.iloc[:,:max_cols]
|
7888
|
+
|
7889
|
+
data_num = df_scaler(data=data_num, method='standard')
|
7890
|
+
|
7891
|
+
import scipy.stats as stats
|
7892
|
+
for column in data_num.columns:
|
7893
|
+
#* Shapiro-Wilk test for normality
|
7894
|
+
stat, p_value = stats.shapiro(data_num[column])
|
7895
|
+
normality = "norm" if p_value > 0.05 else "not_norm"
|
7896
|
+
#* Plot histogram
|
7897
|
+
ax_hist=sns.histplot(data_num[column], kde=True, ax=nexttile())
|
7898
|
+
x_min, x_max = ax_hist.get_xlim()
|
7899
|
+
y_min, y_max = ax_hist.get_ylim()
|
7900
|
+
ax_hist.text(x_min+(x_max-x_min)*0.5, y_min+(y_max-y_min)*0.75,
|
7901
|
+
f'p(Shapiro-Wilk)={p_value:.3f}\n{normality}',
|
7902
|
+
ha='center', va='top')
|
7903
|
+
figsets(title=column,ax=ax_hist)
|
7904
|
+
ax_twin=ax_hist.twinx()
|
7905
|
+
#* Q-Q plot
|
7906
|
+
stats.probplot(data_num[column], dist="norm", plot=ax_twin)
|
7907
|
+
figsets(ylabel=f'Q-Q Plot:{column}',title=None)
|
7908
|
+
# save figure
|
7909
|
+
if dir_save:
|
7910
|
+
figsave(dir_save,f"qq_plot_{now_}.pdf")
|
7861
7911
|
def use_pd(
|
7862
7912
|
func_name="excel",
|
7863
7913
|
verbose=True,
|
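The new distribution check combines a per-column Shapiro-Wilk test with a histogram and Q-Q plot. A self-contained sketch of the same idea using only matplotlib and scipy (no py2ls helpers), with toy data:

    import matplotlib.pyplot as plt
    import numpy as np
    import scipy.stats as stats

    rng = np.random.default_rng(0)
    values = rng.normal(loc=0.0, scale=1.0, size=200)  # stand-in for one numeric column

    # Shapiro-Wilk: p > 0.05 means no evidence against normality.
    stat, p_value = stats.shapiro(values)

    fig, (ax_hist, ax_qq) = plt.subplots(1, 2, figsize=(8, 3))
    ax_hist.hist(values, bins=30)
    ax_hist.set_title(f"p(Shapiro-Wilk)={p_value:.3f}")
    stats.probplot(values, dist="norm", plot=ax_qq)  # Q-Q plot against the normal
    plt.tight_layout()
    plt.show()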